Commit 1f2c0182 authored by Lukas Leufen

Merge branch 'develop' into 'master'

update to v0.4.0

Closes #23, #27, #14, #26, #24, #25, and #13

See merge request toar/machinelearningtools!17
parents 3b54d872 d70a6b27
Pipeline #26710 passed with stages
in 2 minutes and 34 seconds
......@@ -23,3 +23,6 @@ exclude_lines =
# Don't complain about import statements
import
# Don't complain about abstract class declarations and placeholders
pass
......@@ -47,7 +47,7 @@ tests:
when: always
paths:
- badges/
- test/
- test_results/
coverage:
tags:
......@@ -90,7 +90,7 @@ pages:
- cp -af coverage/. public/coverage
- ls public/coverage
- mkdir -p public/test
- cp -af test/. public/test
- cp -af test_results/. public/test
- ls public/test
- ls public
when: always
......@@ -101,7 +101,7 @@ pages:
- public
- badges/
- coverage/
- test/
- test_results/
cache:
key: old-pages
paths:
......
......@@ -6,14 +6,14 @@ python3 -m pytest --html=report.html --self-contained-html test/ | tee test_resu
IS_FAILED=$?
# move html test report
mkdir test/
mkdir test_results/
BRANCH_NAME=$( echo -e "${CI_COMMIT_REF_NAME////_}")
mkdir test/${BRANCH_NAME}
mkdir test/recent
cp report.html test/${BRANCH_NAME}/.
cp report.html test/recent/.
mkdir test_results/${BRANCH_NAME}
mkdir test_results/recent
cp report.html test_results/${BRANCH_NAME}/.
cp report.html test_results/recent/.
if [[ "${CI_COMMIT_REF_NAME}" = "master" ]]; then
cp -r report.html test/.
cp -r report.html test_results/.
fi
# exit 0 if no tests implemented
......
......@@ -3,145 +3,33 @@ __date__ = '2019-11-14'
import logging
from src.helpers import TimeTracking
from src import helpers
import argparse
import time
from src.modules.experiment_setup import ExperimentSetup
from src.modules import run, PreProcessing, Training, PostProcessing
formatter = "%(asctime)s - %(levelname)s: %(message)s [%(filename)s:%(funcName)s:%(lineno)s]"
logging.basicConfig(level=logging.INFO, format=formatter)
def main():
    """
    Run the full experiment workflow (setup only, in this version) inside a
    timed `run` context so the total duration is logged.
    """
    # NOTE(review): `args` is a module-level name that is only assigned inside the
    # `if __name__ == "__main__"` block; calling main() without it raises NameError.
    # NOTE(review): the locally defined ExperimentSetup.__init__ accepts **kwargs
    # only — the positional `args` here matches a different (imported) version;
    # confirm which ExperimentSetup is intended.
    with run():
        exp_setup = ExperimentSetup(args, trainable=True, stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'])
class run(object):
    """
    Basic run class to measure execution time. Either use it as a context
    manager ('with run(): ...') or delete the class instance after finishing
    the measurement. The duration result is logged when the instance is
    garbage collected.
    """

    def __init__(self):
        # start the clock immediately on construction
        self.time = TimeTracking()
        logging.info(f"{self.__class__.__name__} started")

    def __del__(self):
        # stop the clock and report the total duration
        self.time.stop()
        logging.info(f"{self.__class__.__name__} finished after {self.time}")

    def __enter__(self):
        # fix: return self so `with run() as r:` binds the instance instead of None
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def do_stuff(self):
        # placeholder workload used for timing demonstrations
        time.sleep(2)
class ExperimentSetup:
    """
    Collects and stores all parameters of an experiment run as instance
    attributes, applying defaults for anything not given via keyword args.

    params:
    trainable: Train new model if true, otherwise try to load existing model
    """

    def __init__(self, **kwargs):
        # pre-declare every experiment attribute; actual values are filled in
        # by setup_experiment() below
        self.data_path = None
        self.experiment_path = None
        self.experiment_name = None
        self.trainable = None
        self.fraction_of_train = None
        self.use_all_stations_on_all_data_sets = None
        self.network = None
        self.var_all_dict = None
        self.all_stations = None
        self.variables = None
        self.dimensions = None
        self.dim = None
        self.target_dim = None
        self.target_var = None
        self.setup_experiment(**kwargs)

    def _set_param(self, param, value, default=None):
        # Dual semantics: when `default` is given, `value` is expected to be the
        # kwargs dict and the parameter is looked up in it; when `default` is
        # None, `value` itself is stored. NOTE(review): this means a parameter
        # can never default to None through this helper.
        if default is not None:
            value = value.get(param, default)
        setattr(self, param, value)
        logging.info(f"set experiment attribute: {param}={value}")

    def setup_experiment(self, **kwargs):
        """Resolve all experiment parameters from kwargs and built-in defaults."""
        # set data path of this experiment
        self._set_param("data_path", helpers.prepare_host())
        # set experiment name
        # NOTE(review): reads module-global `args` (parsed CLI arguments) —
        # only defined when the file is executed as a script; verify callers.
        exp_date = args.experiment_date
        exp_name, exp_path = helpers.set_experiment_name(experiment_date=exp_date)
        self._set_param("experiment_name", exp_name)
        self._set_param("experiment_path", exp_path)
        helpers.check_path_and_create(self.experiment_path)
        # set if model is trainable
        self._set_param("trainable", kwargs, default=True)
        # set fraction of train
        self._set_param("fraction_of_train", kwargs, default=0.8)
        # use all stations on all data sets (train, val, test)
        self._set_param("use_all_stations_on_all_data_sets", kwargs, default=True)
        self._set_param("network", kwargs, default="AIRBASE")
        # mapping of variable name -> statistic requested from the data source
        self._set_param("var_all_dict", kwargs, default={'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum',
                                                         'u': 'average_values', 'v': 'average_values', 'no': 'dma8eu',
                                                         'no2': 'dma8eu', 'cloudcover': 'average_values',
                                                         'pblheight': 'maximum'})
        # default station id list used when none is provided
        self._set_param("all_stations", kwargs, default=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087',
                                                         'DEBY052', 'DEBY032', 'DEBW022', 'DEBY004', 'DEBY020',
                                                         'DEBW030', 'DEBW037', 'DEBW031', 'DEBW015', 'DEBW073',
                                                         'DEBY039', 'DEBW038', 'DEBW081', 'DEBY075', 'DEBW040',
                                                         'DEBY053', 'DEBW059', 'DEBW027', 'DEBY072', 'DEBW042',
                                                         'DEBW039', 'DEBY001', 'DEBY113', 'DEBY089', 'DEBW024',
                                                         'DEBW004', 'DEBY037', 'DEBW056', 'DEBW029', 'DEBY068',
                                                         'DEBW010', 'DEBW026', 'DEBY002', 'DEBY079', 'DEBW084',
                                                         'DEBY049', 'DEBY031', 'DEBW019', 'DEBW001', 'DEBY063',
                                                         'DEBY005', 'DEBW046', 'DEBW103', 'DEBW052', 'DEBW034',
                                                         'DEBY088', ])
        # variables default to the keys of var_all_dict (depends on the call order above)
        self._set_param("variables", kwargs, default=list(self.var_all_dict.keys()))
        self._set_param("dimensions", kwargs, default={'new_index': ['datetime', 'Stations']})
        self._set_param("dim", kwargs, default='datetime')
        self._set_param("target_dim", kwargs, default='variables')
        self._set_param("target_var", kwargs, default="o3")
class PreProcessing(run):
    """Pre-processing stage; inherits start/stop time tracking and logging from `run`."""

    def __init__(self, setup):
        super().__init__()
        # experiment setup holding all parameters for this stage
        self.setup = setup
class Training(run):
    """Training stage; inherits start/stop time tracking and logging from `run`."""

    def __init__(self, setup):
        super().__init__()
        # experiment setup holding all parameters for this stage
        self.setup = setup
PreProcessing(exp_setup)
class PostProcessing(run):
Training(exp_setup)
def __init__(self, setup):
super().__init__()
self.setup = setup
PostProcessing(exp_setup)
if __name__ == "__main__":
formatter = '%(asctime)s - %(levelname)s: %(message)s [%(filename)s:%(funcName)s:%(lineno)s]'
logging.basicConfig(format=formatter, level=logging.INFO)
parser = argparse.ArgumentParser()
parser.add_argument('--experiment_date', metavar='--exp_date', type=str, nargs=1, default=None,
help="set experiment date as string")
args = parser.parse_args()
with run():
exp_setup = ExperimentSetup(trainable=True)
PreProcessing(exp_setup)
Training(exp_setup)
PostProcessing(exp_setup)
experiment = ExperimentSetup(args, stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'])
a = 1
# main()
......@@ -6,8 +6,6 @@ from src import helpers
from src.data_preparation import DataPrep
import os
from typing import Union, List, Tuple
import decimal
import numpy as np
import xarray as xr
......@@ -20,17 +18,18 @@ class DataGenerator(keras.utils.Sequence):
one entry of integer or string
"""
def __init__(self, path: str, network: str, stations: Union[str, List[str]], variables: List[str],
interpolate_dim: str, target_dim: str, target_var: str, interpolate_method: str = "linear",
limit_nan_fill: int = 1, window_history: int = 7, window_lead_time: int = 4,
transform_method: str = "standardise", **kwargs):
self.path = os.path.abspath(path)
def __init__(self, data_path: str, network: str, stations: Union[str, List[str]], variables: List[str],
interpolate_dim: str, target_dim: str, target_var: str, station_type: str = None,
interpolate_method: str = "linear", limit_nan_fill: int = 1, window_history: int = 7,
window_lead_time: int = 4, transform_method: str = "standardise", **kwargs):
self.data_path = os.path.abspath(data_path)
self.network = network
self.stations = helpers.to_list(stations)
self.variables = variables
self.interpolate_dim = interpolate_dim
self.target_dim = target_dim
self.target_var = target_var
self.station_type = station_type
self.interpolate_method = interpolate_method
self.limit_nan_fill = limit_nan_fill
self.window_history = window_history
......@@ -42,9 +41,10 @@ class DataGenerator(keras.utils.Sequence):
"""
display all class attributes
"""
return f"DataGenerator(path='{self.path}', network='{self.network}', stations={self.stations}, " \
f"variables={self.variables}, interpolate_dim='{self.interpolate_dim}', target_dim='{self.target_dim}'" \
f", target_var='{self.target_var}', **{self.kwargs})"
return f"DataGenerator(path='{self.data_path}', network='{self.network}', stations={self.stations}, " \
f"variables={self.variables}, station_type={self.station_type}, " \
f"interpolate_dim='{self.interpolate_dim}', target_dim='{self.target_dim}', " \
f"target_var='{self.target_var}', **{self.kwargs})"
def __len__(self):
"""
......@@ -96,7 +96,8 @@ class DataGenerator(keras.utils.Sequence):
:return: preprocessed data as a DataPrep instance
"""
station = self.get_station_key(key)
data = DataPrep(self.path, self.network, station, self.variables, **self.kwargs)
data = DataPrep(self.data_path, self.network, station, self.variables, station_type=self.station_type,
**self.kwargs)
data.interpolate(self.interpolate_dim, method=self.interpolate_method, limit=self.limit_nan_fill)
data.transform("datetime", method=self.transform_method)
data.make_history_window(self.interpolate_dim, self.window_history)
......
......@@ -44,11 +44,13 @@ class DataPrep(object):
"""
def __init__(self, path: str, network: str, station: Union[str, List[str]], variables: List[str], **kwargs):
def __init__(self, path: str, network: str, station: Union[str, List[str]], variables: List[str],
station_type: str = None, **kwargs):
self.path = os.path.abspath(path)
self.network = network
self.station = helpers.to_list(station)
self.variables = variables
self.station_type = station_type
self.mean = None
self.std = None
self.history = None
......@@ -75,14 +77,36 @@ class DataPrep(object):
file_name = self._set_file_name()
meta_file = self._set_meta_file_name()
try:
logging.debug(f"try to load local data from: {file_name}")
data = self._slice_prep(xr.open_dataarray(file_name))
self.data = self.check_for_negative_concentrations(data)
self.meta = pd.read_csv(meta_file, index_col=0)
if self.station_type is not None:
self.check_station_meta()
logging.debug("loading finished")
except FileNotFoundError as e:
logging.warning(e)
data, self.meta = self.download_data_from_join(file_name, meta_file)
data = self._slice_prep(data)
self.data = self.check_for_negative_concentrations(data)
logging.debug("loaded new data from JOIN")
def check_station_meta(self):
    """
    Search for the entries in meta data and compare the value with the requested
    values. Compares the requested station type and network name against the
    meta data of the first station. Raise a FileNotFoundError if the values
    mismatch, which triggers a fresh download by the caller.
    """
    check_dict = {
        "station_type": self.station_type,
        "network_name": self.network
    }
    for (k, v) in check_dict.items():
        # meta data frame is indexed by meta key with one column per station;
        # hoist the lookup so it is done once per key
        local = self.meta.at[k, self.station[0]]
        if local != v:
            logging.debug(f"meta data does not agree with given request for {k}: {v} (requested) != "
                          f"{local} (local). Raise FileNotFoundError to trigger new "
                          f"grabbing from web.")
            raise FileNotFoundError
def download_data_from_join(self, file_name: str, meta_file: str) -> [xr.DataArray, pd.DataFrame]:
"""
......@@ -92,7 +116,8 @@ class DataPrep(object):
:return:
"""
df_all = {}
df, meta = join.download_join(station_name=self.station, statvar=self.statistics_per_var)
df, meta = join.download_join(station_name=self.station, statvar=self.statistics_per_var,
station_type=self.station_type, network_name=self.network)
df_all[self.station[0]] = df
# convert df_all to xarray
xarr = {k: xr.DataArray(v, dims=['datetime', 'variables']) for k, v in df_all.items()}
......@@ -111,7 +136,7 @@ class DataPrep(object):
def __repr__(self):
return f"Dataprep(path='{self.path}', network='{self.network}', station={self.station}, " \
f"variables={self.variables}, **{self.kwargs})"
f"variables={self.variables}, station_type={self.station_type}, **{self.kwargs})"
def interpolate(self, dim: str, method: str = 'linear', limit: int = None,
use_coordinate: Union[bool, str] = True, **kwargs):
......
__author__ = 'Lukas Leufen'
__date__ = '2019-11-22'
from typing import Any, List, Tuple
from abc import ABC
class NameNotFoundInDataStore(Exception):
    """Raised when a requested name does not exist anywhere in the data store."""
class NameNotFoundInScope(Exception):
    """Raised when a name exists in the data store but not in the requested scope (or its parents)."""
class EmptyScope(Exception):
    """Raised when a queried scope is not part of the data store at all."""
class AbstractDataStore(ABC):
    """
    Data store for all settings for the experiment workflow to save experiment parameters for the proceeding modules
    and predefine parameters loaded during the experiment setup phase. The data store is hierarchically structured, so
    that global settings can be overwritten by local adjustments.

    Subclasses decide how `_store` is organised (e.g. by variable or by scope)
    and implement the abstract methods below.
    """

    def __init__(self):
        # empty initialise the data-store variables
        self._store = {}

    def put(self, name: str, obj: Any, scope: str) -> None:
        """
        Abstract method to add an object to the data store

        :param name: Name of object to store
        :param obj: The object itself to be stored
        :param scope: the scope / context of the object, under that the object is valid
        """
        pass

    def get(self, name: str, scope: str) -> Any:
        """
        Abstract method to get an object from the data store

        :param name: Name to look for
        :param scope: scope to search the name for
        :return: the stored object
        """
        pass

    def search_name(self, name: str) -> List[str]:
        """
        Abstract method to search for all occurrences of given `name` in the entire data store.

        :param name: Name to look for
        :return: search result
        """
        pass

    def search_scope(self, scope: str) -> List[str]:
        """
        Abstract method to search for all object names that are stored for given scope

        :param scope: scope to look for
        :return: search result
        """
        pass

    def list_all_scopes(self) -> List[str]:
        """
        Abstract method to list all scopes in data store

        :return: all found scopes
        """
        pass

    def list_all_names(self) -> List[str]:
        """
        List all names available in the data store.

        :return: all names
        """
        pass

    def clear_data_store(self) -> None:
        # drop every stored entry by resetting the underlying dict
        self._store = {}
class DataStoreByVariable(AbstractDataStore):
    """
    Hierarchical data store keyed by variable name first and scope second:

    <variable1>
        <scope1>: value
        <scope2>: value
    <variable2>
        <scope1>: value
        <scope3>: value

    Lookups fall back from the requested scope to its parent scopes (dot
    separated), so global settings can be overwritten by local adjustments.
    """

    def put(self, name: str, obj: Any, scope: str) -> None:
        """
        Save `obj` under `name` for the given `scope`. An already existing entry
        for the same name and scope is overwritten.

        :param name: Name of object to store
        :param obj: The object itself to be stored
        :param scope: the scope / context of the object, under that the object is valid
        """
        # create the per-variable sub-dict on first use, then store by scope
        self._store.setdefault(name, {})[scope] = obj

    def get(self, name: str, scope: str) -> Any:
        """
        Fetch the object stored as `name`, starting at `scope` and walking up
        the dot-separated scope hierarchy. Raises NameNotFoundInDataStore if the
        name exists nowhere in the store; raises NameNotFoundInScope if the name
        exists, but not inside the requested scope chain.

        :param name: Name to look for
        :param scope: scope to search the name for
        :return: the stored object
        """
        _, _, value = self._stride_through_scopes(name, scope)
        return value

    def _stride_through_scopes(self, name, scope, depth=0):
        # Walk the scope chain from most specific to most general: each level
        # strips one trailing ".<component>" from `scope`.
        for level in range(depth, scope.count(".") + 1):
            local_scope = scope.rsplit(".", maxsplit=level)[0]
            entries = self._store.get(name, {})
            if local_scope in entries:
                return name, local_scope, entries[local_scope]
        # nothing found: distinguish "name missing entirely" from "name only in
        # unrelated scopes"
        occurrences = self.search_name(name)
        if not occurrences:
            raise NameNotFoundInDataStore(f"Couldn't find {name} in data store")
        raise NameNotFoundInScope(f"Couldn't find {name} in scope {scope} . {name} is only defined in "
                                  f"{occurrences}")

    def search_name(self, name: str) -> List[str]:
        """
        Search for all occurrences of given `name` in the entire data store.

        :param name: Name to look for
        :return: sorted list of all scopes and sub-scopes containing `name`
        """
        return sorted(self._store.get(name, []))

    def search_scope(self, scope: str, current_scope_only=True, return_all=False) -> List[str or Tuple]:
        """
        List all object names stored under `scope`. To look also for all
        superior scopes set `current_scope_only=False`. To return the scope and
        the object's value too, set `return_all=True`.

        :param scope: scope to look for
        :param current_scope_only: look only for all names for given scope if true, else search for names from superior
            scopes too.
        :param return_all: return name, definition scope and value if True, else just the name
        :return: sorted list of names, or of (name, scope, value) tuples when `return_all=True`
        """
        if current_scope_only:
            names = [n for n, scopes in self._store.items() if scope in scopes]
            if not names:
                raise EmptyScope(f"Given scope {scope} is not part of the data store. Available scopes are: "
                                 f"{self.list_all_scopes()}")
            if return_all:
                return sorted([(name, scope, self._store[name][scope]) for name in names], key=lambda tup: tup[0])
            return sorted(names)
        # search every name, keeping those resolvable from `scope` or its parents
        results = []
        for name in self.list_all_names():
            try:
                res = self._stride_through_scopes(name, scope)
            except (NameNotFoundInDataStore, NameNotFoundInScope):
                continue
            results.append(res if return_all else res[0])
        if return_all:
            return sorted(results, key=lambda tup: tup[0])
        return sorted(results)

    def list_all_scopes(self) -> List[str]:
        """
        List all available scopes in data store

        :return: sorted names of all stored scopes
        """
        scopes = set()
        for entries in self._store.values():
            scopes.update(entries)
        return sorted(scopes)

    def list_all_names(self) -> List[str]:
        """
        List all names available in the data store.

        :return: all names, sorted
        """
        return sorted(self._store)
class DataStoreByScope(AbstractDataStore):
"""
Data store for all settings for the experiment workflow to save experiment parameters for the proceeding modules
and predefine parameters loaded during the experiment setup phase. The data store is hierarchically structured, so
that global settings can be overwritten by local adjustments.
This implementation stores data as
<scope1>
<variable1>: value
<variable2>: value
<scope2>
<variable1>: value
<variable3>: value
"""
def put(self, name: str, obj: Any, scope: str) -> None:
    """
    Save `obj` under `name` inside `scope`. An already existing entry for the
    same scope and name is overwritten.

    :param name: Name of object to store
    :param obj: The object itself to be stored
    :param scope: the scope / context of the object, under that the object is valid
    """
    # create the per-scope sub-dict on first use, then store by name
    self._store.setdefault(scope, {})[name] = obj
def get(self, name: str, scope: str) -> Any:
    """
    Fetch the object stored as `name`, starting at `scope` and walking up the
    dot-separated scope hierarchy. Raises NameNotFoundInDataStore if the name
    exists nowhere in the store; raises NameNotFoundInScope if the name exists,
    but not inside the requested scope chain.

    :param name: Name to look for
    :param scope: scope to search the name for
    :return: the stored object
    """
    _, _, value = self._stride_through_scopes(name, scope)
    return value
def _stride_through_scopes(self, name, scope, depth=0):
    # Walk from the most specific scope upwards: at recursion depth d the last
    # d dot-separated components of `scope` are stripped before the lookup.
    if depth <= scope.count("."):
        local_scope = scope.rsplit(".", maxsplit=depth)[0]
        try:
            # store layout here is _store[scope][name]
            return name, local_scope, self._store[local_scope][name]
        except KeyError:
            # not stored at this level -> retry one scope level up
            return self._stride_through_scopes(name, scope, depth + 1)
    else:
        # exhausted the scope chain: distinguish "missing entirely" from
        # "defined only in other scopes"
        occurrences = self.search_name(name)
        if len(occurrences) == 0:
            raise NameNotFoundInDataStore(f"Couldn't find {name} in data store")
        else:
            raise NameNotFoundInScope(f"Couldn't find {name} in scope {scope} . {name} is only defined in "
                                      f"{occurrences}")
def search_name(self, name: str) -> List[str]:
    """
    Search for all occurrences of given `name` in the entire data store.

    :param name: Name to look for
    :return: sorted list of all scopes containing an object stored as `name`
    """
    return sorted(scope for scope, entries in self._store.items() if name in entries)
def search_scope(self, scope: str, current_scope_only: bool = True, return_all: bool = False) -> List[str or Tuple]:
"""
Search for given `scope` and list all object names stored under this scope. To look also for all superior scopes