Commit c1cb2c8e authored by lukas leufen

Merge branch 'release_v0.12.0' into 'master'

Resolve "new release v0.12.0"

Closes #149, #171, #170, #169, #168, #162, #159, #154, and #153

See merge request !145
parents d099bd6d 7f16cdbc
Pipeline #46729 passed with stages
in 9 minutes and 44 seconds
@@ -2,6 +2,23 @@
All notable changes to this project will be documented in this file.
## v0.12.0 - 2020-09-21 - Documentation and Bugfixes
### general:
- improved documentation, including installation instructions and many examples from the paper, #153
- bugfixes (see technical)
### new features:
- `MyLittleModel` is now a pure feed-forward network (before it had a CNN part), #168
### technical:
- new compile options check to ensure its execution, #154
- bugfix for key errors in time series plot, #169
- bugfix for not used kwargs in `DefaultDataHandler`, #170
- `trainable` parameter is renamed to `train_model` to prevent confusion with the tf trainable parameter, #162
- fixed HPC installation failure, #159
## v0.11.0 - 2020-08-24 - Advanced Data Handling for MLAir
### general
@@ -11,7 +28,7 @@ All notable changes to this project will be documented in this file.
### new features
- default data handler using TOAR DB refactored according to advanced data handling, #140, #141, #152
- data sets are handled as collections, #142, and are iterable in a standard way (StandardIterator) and optimised for
keras (KerasIterator), #143
- automatically moving station map plot, #136
@@ -117,15 +117,17 @@ EOT
fi
echo
echo
echo "###################################################################################"
echo "# You have to run the following command on a login node to download data: #"
echo "# \`python run_HPC.py' #"
echo "# #"
echo "# Please execute the following command to check if the setup went well: #"
if [[ ${hpcsys} = 'juwels' ]]; then
echo "# \`sbatch run_juwels_develgpus.bash' #"
else
echo "# \`sbatch run_hdfml_batch.bash' #"
fi
echo "###################################################################################"
@@ -8,7 +8,7 @@
module --force purge
module use $OTHERSTAGES
ml Stages/2019a
ml GCCcore/.8.3.0
ml Jupyter/2019a-Python-3.6.8
@@ -18,4 +18,4 @@ ml Keras/2.2.4-GPU-Python-3.6.8
ml SciPy-Stack/2019a-Python-3.6.8
ml dask/1.1.5-Python-3.6.8
ml GEOS/3.7.1-Python-3.6.8
ml Graphviz/2.40.1
absl-py==0.9.0
astor==0.8.1
atomicwrites==1.3.0
attrs==19.3.0
certifi==2019.11.28
chardet==3.0.4
cloudpickle==1.3.0
coverage==5.0.3
cycler==0.10.0
Cython==0.29.15
dask==2.11.0
fsspec==0.6.2
gast==0.3.3
grpcio==1.27.2
h5py==2.10.0
idna==2.8
importlib-metadata==1.5.0
kiwisolver==1.1.0
locket==0.2.0
Markdown==3.2.1
matplotlib==3.2.0
mock==4.0.1
more-itertools==8.2.0
numpy==1.18.1
packaging==20.3
pandas==1.0.1
partd==1.1.0
patsy==0.5.1
Pillow==7.0.0
pluggy==0.13.1
protobuf==3.11.3
py==1.8.1
pydot==1.4.1
pyparsing==2.4.6
pyproj==2.5.0
pyshp==2.1.0
pytest==5.3.5
pytest-cov==2.8.1
pytest-html==2.0.1
pytest-lazy-fixture==0.6.3
pytest-metadata==1.8.0
pytest-sugar
python-dateutil==2.8.1
pytz==2019.3
PyYAML==5.3
requests==2.23.0
scipy==1.4.1
seaborn==0.10.0
--no-binary shapely Shapely==1.7.0
six==1.11.0
statsmodels==0.11.1
tabulate
toolz==0.10.0
typing-extensions
urllib3==1.25.8
wcwidth==0.1.8
Werkzeug==1.0.0
xarray==0.15.0
zipp==3.1.0
@@ -24,18 +24,15 @@ source ${cur}/../venv_juwels/bin/activate
# export path for side-packages
export PYTHONPATH=${cur}/../venv_juwels/lib/python3.6/site-packages:${PYTHONPATH}
echo "##### START INSTALLING requirements_JUWELS_additionals.txt #####"
pip install -r ${cur}/requirements_JUWELS_additionals.txt
echo "##### FINISH INSTALLING requirements_JUWELS_additionals.txt #####"
pip install netcdf4
pip install --ignore-installed matplotlib==3.2.0
pip install --ignore-installed pandas==1.0.1
pip install -U typing_extensions
# Comment: Maybe we have to export PYTHONPATH a second time after activating the venv (after job allocation)
# source venv/bin/activate
# alloc_develgpu
# source venv/bin/activate
# export PYTHONPATH=${PWD}/venv/lib/python3.6/site-packages:${PYTHONPATH}
# srun python run.py
# create batch run scripts
# source create_runscripts_HPC.sh
# MLAir - Machine Learning on Air Data
MLAir (Machine Learning on Air data) is an environment that simplifies and accelerates the creation of new machine
learning (ML) models for the analysis and forecasting of meteorological and air quality time series. You can find the
docs [here](http://toar.pages.jsc.fz-juelich.de/mlair/docs/).
[[_TOC_]]
# Installation
@@ -9,7 +12,8 @@ MLAir is based on several python frameworks. To work properly, you have to insta
`requirements.txt` file. Additionally, to support the geographical plotting part, it is required to install geo
packages built for your operating system. The names of these packages may differ between systems; we refer
here to the opensuse / leap OS. The geo plot can be removed from the `plot_list`; in this case there is no need to
install the geo packages. For special instructions on how to install MLAir on the Juelich HPC systems, see
[here](#special-instructions-for-installation-on-jülich-hpc-systems).
* (geo) Install **proj** on your machine using the console. E.g. for opensuse / leap `zypper install proj`
* (geo) A c++ compiler is required for the installation of the program **cartopy**
@@ -27,7 +31,9 @@ install the geo packages.
# How to start with MLAir
In this section, we show three examples of how to work with MLAir. Note that for these examples MLAir was installed
using the distribution file. If you are using the git clone instead, it is required to adjust the import path if MLAir
is not executed directly inside its source directory.
## Example 1
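A first run of the default setup needs nothing more than importing the package and calling its run method. The
following is a minimal sketch, assuming all arguments are left at their default values:

```python
import mlair

# run the default experiment without any customisation
mlair.run()
```

The (abbreviated) log output of such a dry run ends like this: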
```log
...
INFO: No training has started, because trainable parameter was false.
INFO: mlair finished after 00:00:06 (hh:mm:ss)
```
# Default Workflow
MLAir consists of so-called `run_modules`, which are executed in a distinct order called a `workflow`. MLAir
provides a `DefaultWorkflow`. This workflow runs the run modules `ExperimentSetup`, `PreProcessing`,
`ModelSetup`, `Training`, and `PostProcessing` one by one. If additional steps are to be performed, you have to append
custom run modules to the workflow.
![Sketch of the default workflow.](docs/_source/_plots/run_modules_schedule.png)
```python
import mlair
# create the default MLAir workflow
DefaultWorkflow = mlair.DefaultWorkflow()
# execute default workflow
DefaultWorkflow.run()
```
The output of running this default workflow will be structured like the following.
```log
INFO: mlair started
INFO: ExperimentSetup started
...
INFO: ExperimentSetup finished after 00:00:01 (hh:mm:ss)
INFO: PreProcessing started
...
INFO: PreProcessing finished after 00:00:11 (hh:mm:ss)
INFO: ModelSetup started
...
INFO: ModelSetup finished after 00:00:01 (hh:mm:ss)
INFO: Training started
...
INFO: Training finished after 00:02:15 (hh:mm:ss)
INFO: PostProcessing started
...
INFO: PostProcessing finished after 00:01:37 (hh:mm:ss)
INFO: mlair finished after 00:04:05 (hh:mm:ss)
```
# Customised Run Module and Workflow
It is possible to create new custom run modules. A custom run module is required to inherit from the base class
`RunEnvironment` and to hold the constructor method `__init__()`. This method has to execute the module on call.
In the following example, this is done by using the `_run()` method that is called by the initialiser. It is
possible to pass arguments to the custom run module as shown.
```python
import mlair
import logging


class CustomStage(mlair.RunEnvironment):
    """Custom run stage that logs its argument and the `epochs` value from the data store."""

    def __init__(self, test_string):
        super().__init__()  # always call super init method
        self._run(test_string)  # call a class method

    def _run(self, test_string):
        logging.info("Just running a custom stage.")
        logging.info("test_string = " + test_string)
        epochs = self.data_store.get("epochs")
        logging.info("epochs = " + str(epochs))
```
If a custom run module is defined, it is required to adjust the workflow. For this, you need to load the empty
`Workflow` class and add each run module that is required. The order of adding modules defines the order of
execution when running the workflow.
```python
# create your custom MLAir workflow
CustomWorkflow = mlair.Workflow()
# provide stages without initialisation
CustomWorkflow.add(CustomStage, test_string="Hello World")
# finally execute custom workflow in order of adding
CustomWorkflow.run()
```
The output will look like:
```log
INFO: mlair started
...
INFO: CustomStage finished after 00:00:01 (hh:mm:ss)
INFO: mlair finished after 00:00:13 (hh:mm:ss)
```
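For orientation, a fuller custom workflow that mixes the default run modules with the custom stage could be sketched
as follows. This is only a sketch: it assumes that the run modules named in the default workflow section
(`ExperimentSetup`, `PreProcessing`, `ModelSetup`, `Training`, `PostProcessing`) are importable from the top-level
`mlair` package and that `epochs` is a valid `ExperimentSetup` argument.

```python
import mlair

CustomWorkflow = mlair.Workflow()
# default stages, added in the order they should run
CustomWorkflow.add(mlair.ExperimentSetup, epochs=128)
CustomWorkflow.add(mlair.PreProcessing)
CustomWorkflow.add(mlair.ModelSetup)
CustomWorkflow.add(mlair.Training)
# the custom stage from above, which reads "epochs" from the data store
CustomWorkflow.add(CustomStage, test_string="Hello World")
CustomWorkflow.add(mlair.PostProcessing)
# execute in order of adding
CustomWorkflow.run()
```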
# Custom Model
Create your own model to run your personal experiment. To guarantee a proper integration in the MLAir workflow, models
are restricted to inherit from the `AbstractModelClass`. This will ensure a smooth training and evaluation
behaviour.
## How to create a customised model?

* Create a new model class inheriting from `AbstractModelClass`

```python
import keras

from mlair.model_modules import AbstractModelClass


class MyCustomisedModel(AbstractModelClass):

    def __init__(self, shape_inputs: list, shape_outputs: list):

        super().__init__(shape_inputs[0], shape_outputs[0])

        # settings
        self.dropout_rate = 0.1
        self.activation = keras.layers.PReLU

        # apply to model
        self.set_model()
        self.set_compile_options()
        self.set_custom_objects(loss=self.compile_options['loss'])
```
* Make sure to add the `super().__init__()` and at least `set_model()` and `set_compile_options()` to your
custom init method.
* The shown model expects a single input and output branch, each provided in a list. Therefore, the shapes of input and
output are extracted and then provided to the super class initialiser.
* Additionally, some general settings like the dropout rate are set in the init method.
* If you have custom objects in your model that are not part of the keras or tensorflow frameworks, you need to add
them to the custom objects. To do this, call `set_custom_objects` with arbitrary kwargs. In the shown example, the
loss has been added for demonstration only, because we use a built-in loss function. Nonetheless, we always encourage
you to add the loss as a custom object, to prevent potential errors when loading an already created model instead of
training a new one.
* Now build your model inside `set_model()` by using the instance attributes `self.shape_inputs` and
`self.shape_outputs` and storing the model as `self.model`.
```python
class MyCustomisedModel(AbstractModelClass):

    def set_model(self):
        x_input = keras.layers.Input(shape=self.shape_inputs)
        x_in = keras.layers.Conv2D(32, (1, 1), padding='same', name='{}_Conv_1x1'.format("major"))(x_input)
        x_in = self.activation(name='{}_conv_act'.format("major"))(x_in)
        x_in = keras.layers.Flatten(name='{}'.format("major"))(x_in)
        x_in = keras.layers.Dropout(self.dropout_rate, name='{}_Dropout_1'.format("major"))(x_in)
        x_in = keras.layers.Dense(16, name='{}_Dense_16'.format("major"))(x_in)
        x_in = self.activation()(x_in)
        x_in = keras.layers.Dense(self.shape_outputs, name='{}_Dense'.format("major"))(x_in)
        out_main = self.activation()(x_in)
        self.model = keras.Model(inputs=x_input, outputs=[out_main])
```
* You are free in how you design your model. Just make sure to save it in the class attribute `model`.
* Additionally, set your custom compile options including the loss definition.
```python
class MyCustomisedModel(AbstractModelClass):

    def set_compile_options(self):
        self.initial_lr = 1e-2
        self.optimizer = keras.optimizers.SGD(lr=self.initial_lr, momentum=0.9)
        self.lr_decay = mlair.model_modules.keras_extensions.LearningRateDecay(base_lr=self.initial_lr,
                                                                               drop=.94,
                                                                               epochs_drop=10)
        self.loss = keras.losses.mean_squared_error
        self.compile_options = {"metrics": ["mse", "mae"]}
```
* The allocation of the instance parameters `initial_lr`, `optimizer`, and `lr_decay` could also be part of
the model class' initialiser. The same applies to `self.loss` and `compile_options`, but we recommend using
the `set_compile_options` method for the definition of parameters that are related to the compile options.
* More important is that the compile options are actually saved. There are three ways to achieve this.
* (1): Set all compile options by passing a dictionary with all options to `self.compile_options`.
* (2): Set all compile options as instance attributes. MLAir will search for these attributes and store them.
* (3): Define your compile options partly as dictionary and instance attributes (as shown in this example).
* If using (3) and defining the same compile option with different values, MLAir will raise an error.
Incorrect (this will raise an error because of a mismatch for the `optimizer` parameter):
```python
def set_compile_options(self):
    self.optimizer = keras.optimizers.SGD()
    self.loss = keras.losses.mean_squared_error
    self.compile_options = {"optimizer": keras.optimizers.Adam()}
```
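For comparison, a consistent version of the same method that mixes instance attributes and the dictionary without
conflicting values (option (3) from above) might look like this:

```python
def set_compile_options(self):
    self.optimizer = keras.optimizers.SGD()
    self.loss = keras.losses.mean_squared_error
    self.compile_options = {"metrics": ["mse"]}
```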
## Specials for Branched Models
* If you have a branched model with multiple outputs, you either need to set only a single loss for all branch outputs
or provide the same number of loss functions, considering the right order.
```python
class MyCustomisedModel(AbstractModelClass):

    def set_model(self):
        ...
        self.model = keras.Model(inputs=x_input, outputs=[out_minor_1, out_minor_2, out_main])

    def set_compile_options(self):
        self.loss = [keras.losses.mean_absolute_error,  # for out_minor_1
                     keras.losses.mean_squared_error,   # for out_minor_2
                     keras.losses.mean_squared_error]   # for out_main
```
## How to access my customised model?

If the customised model is created, you can easily access the model with

```python
>>> MyCustomisedModel().model
<your custom model>
```

The loss is accessible via

```python
>>> MyCustomisedModel().loss
<your custom loss>
```

You can treat the instance of your model as an instance but also as the model itself. If you call a method that refers
to the model instead of the model instance, you can directly apply the command on the instance instead of adding the
model parameter call.

```python
>>> MyCustomisedModel().model.compile(**kwargs) == MyCustomisedModel().compile(**kwargs)
True
```

## Transformation

There are two different approaches (called scopes) to transform the data:
1) `station`: transform data for each station independently (somewhat like batch normalisation)
2) `data`: transform all data of each station with shared metrics

Transformation must be set by the `transformation` attribute. If `transformation = None` is given to `ExperimentSetup`,
data is not transformed at all. For all other setups, use the following dictionary structure to specify the
transformation.

```python
transformation = {"scope": <...>,
                  "method": <...>,
                  "mean": <...>,
                  "std": <...>}
ExperimentSetup(..., transformation=transformation, ...)
```

### scopes

**station**: mean and std are not used

**data**: either provide already calculated values for mean and std (if required by the transformation method), or
choose from the different calculation schemes explained in the mean and std section.

### supported transformation methods

Currently supported methods are:
* standardise (default, if method is not given)
* centre

### mean and std

`"mean"="accurate"`: calculate the accurate values of mean and std (depending on method) by using all data. Although
this method is accurate, it may take some time for the calculation. Furthermore, it could potentially lead to memory
issues (not explored yet, but this could appear for a very big amount of data).

`"mean"="estimate"`: estimate mean and std (depending on method). For each station, mean and std are calculated and
afterwards aggregated using the mean value over all station-wise metrics. This method is less accurate, especially
regarding the std calculation, but therefore much faster.

We recommend to use the latter method *estimate* for the following reasons:
* much faster calculation
* real accuracy of mean and std is less important, because it is "just" a transformation / scaling
* accuracy of mean is almost as high as in the *accurate* case, because of
$\bar{x_{ij}} = \bar{\left(\bar{x_i}\right)_j}$. The only difference is that in the *estimate* case, each mean is
equally weighted for each station, independently of the actual data count of the station.
* accuracy of std is lower for *estimate* because $\mathrm{Var}(x_{ij}) \ne \overline{\left(\mathrm{Var}(x_i)\right)_j}$, but still the mean of all
station-wise std values is a decent estimate of the true std.

`"mean"=<value, e.g. xr.DataArray>`: If mean and std are already calculated or shall be set manually, just add the
scaling values instead of the calculation method. For the method *centre*, std can still be None, but it is required
for the *standardise* method. **Important**: The format of the given values **must** match the internal data format of
the DataPreparation class: `xr.DataArray` with `dims=["variables"]` and one value for each variable.

# Data Handlers

Data handlers are responsible for all tasks related to data, like data acquisition, preparation, and provision. A data
handler must inherit from the abstract base class `AbstractDataHandler` and requires the implementation of the
`__init__()` method and the accessors `get_X()` and `get_Y()`. In the following, we show an example of how a custom
data handler could look.

```python
import datetime as dt
import numpy as np
import pandas as pd
import xarray as xr
from mlair.data_handler import AbstractDataHandler


class DummyDataHandler(AbstractDataHandler):

    def __init__(self, name, number_of_samples=None):
        """This data handler takes a name argument and the number of samples to generate. If not provided, a random
        number between 100 and 150 is set."""
        super().__init__()
        self.name = name
        self.number_of_samples = number_of_samples if number_of_samples is not None else np.random.randint(100, 150)
        self._X = self.create_X()
        self._Y = self.create_Y()

    def create_X(self):
        """Inputs are random numbers between 0 and 10 with shape (no_samples, window=14, variables=5)."""
        X = np.random.randint(0, 10, size=(self.number_of_samples, 14, 5))  # samples, window, variables
        datelist = pd.date_range(dt.datetime.today().date(), periods=self.number_of_samples, freq="H").tolist()
        return xr.DataArray(X, dims=['datetime', 'window', 'variables'], coords={"datetime": datelist,
                                                                                 "window": range(14),
                                                                                 "variables": range(5)})

    def create_Y(self):
        """Targets are normal distributed random numbers with shape (no_samples, window=5, variables=1)."""
        Y = np.round(0.5 * np.random.randn(self.number_of_samples, 5, 1), 1)  # samples, window, variables
        datelist = pd.date_range(dt.datetime.today().date(), periods=self.number_of_samples, freq="H").tolist()
        return xr.DataArray(Y, dims=['datetime', 'window', 'variables'], coords={"datetime": datelist,
                                                                                 "window": range(5),
                                                                                 "variables": range(1)})

    def get_X(self, upsampling=False, as_numpy=False):
        """Upsampling parameter is not used for X."""
        return np.copy(self._X) if as_numpy is True else self._X

    def get_Y(self, upsampling=False, as_numpy=False):
        """Upsampling parameter is not used for Y."""
        return np.copy(self._Y) if as_numpy is True else self._Y

    def __str__(self):
        return self.name
```
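As a quick, self-contained sanity check (independent of any MLAir workflow), the dummy handler above can be
instantiated and queried directly; the expected shapes follow from the `create_X()` and `create_Y()` definitions:

```python
dh = DummyDataHandler("station_A", number_of_samples=120)
print(dh)                             # station_A
print(dh.get_X(as_numpy=True).shape)  # (120, 14, 5)
print(dh.get_Y(as_numpy=True).shape)  # (120, 5, 1)
```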
# Special Remarks
@@ -297,3 +432,4 @@ Therefore, it might be necessary to adapt the `if` statement in `PartitionCheck.
add it to `src/join_settings.py` in the hourly data section. Replace the `TOAR_SERVICE_URL` and the `Authorization`
value. To make sure that this **sensitive** data is not uploaded to the remote server, use the following command to
prevent git from tracking this file: `git update-index --assume-unchanged src/join_settings.py`
#!/bin/bash -x
echo "############################################################"
echo "# #"
echo "# user interaction required #"
echo "# #"
echo "############################################################"
echo
echo "This script creates the HPC batch scripts to run mlt on compute nodes (gpus and develgpus)."
echo "You can modify the created run scripts afterwards if needed."
while true; do
read -p "Do you wish to create run scripts for JUWELS? [yes/no]" yn
case $yn in
[Yy]* ) juwels=True; break;;
[Nn]* ) juwels=False; break;;
* ) echo "Please answer yes or no.";;
esac
done
while true; do
read -p "Do you wish to create run script for HDFML? [yes/no]" yn
case $yn in
[Yy]* ) hdfml=True; break;;
[Nn]* ) hdfml=False; break;;
* ) echo "Please answer yes or no.";;
esac
done
budget=''
while [[ $budget == '' ]]
do
echo
read -p "Enter project budget for --account flag: " budget
done
email=`jutil user show -o json | grep email | cut -f2 -d':' | cut -f1 -d',' | cut -f2 -d'"'`
echo
read -p "Enter e-mail address for --mail-user (default: ${email}): " new_email
if [[ -z "$new_email" ]]; then
new_email=$email
fi
# create HPC_logging dir
hpclogging="../HPC_logging/"
mkdir -p $hpclogging
# ordering for looping:
# "partition nGPUs timing"
if [[ $juwels == True ]]; then
for i in "develgpus 2 02:00:00" "gpus 4 08:00:00"; do
set -- $i
cat <<EOT > run_$1.bash
#!/bin/bash -x
#SBATCH --account=${budget}
#SBATCH --nodes=1
#SBATCH --output=${hpclogging}mlt-out.%j
#SBATCH --error=${hpclogging}/mlt-err.%j
#SBATCH --time=$3
#SBATCH --partition=$1
#SBATCH --gres=gpu:$2
#SBATCH --mail-type=ALL
#SBATCH --mail-user=${new_email}
source mlt_modules_.sh
source venv/bin/activate
timestamp=\`date +"%Y-%m-%d_%H%M-%S"\`