Commit 2258f373 authored by lukas leufen's avatar lukas leufen

added example for data handlers

parent 27734c2e
Pipeline #45977 passed with stages
in 6 minutes and 40 seconds
......@@ -348,6 +348,62 @@ parameter call.
True
```
# Data Handlers
Data handlers are responsible for all data-related tasks such as data acquisition, preparation, and provision. A data
handler must inherit from the abstract base class `AbstractDataHandler` and must implement the `__init__()` method
and the accessors `get_X()` and `get_Y()`. The following example shows what a custom data handler could look
like.
```python
import datetime as dt
import numpy as np
import pandas as pd
import xarray as xr
from mlair.data_handler import AbstractDataHandler
class DummyDataHandler(AbstractDataHandler):
    """Minimal example data handler that serves randomly generated inputs and targets."""

    def __init__(self, name, number_of_samples=None):
        """Store the handler's name and generate the random inputs and targets once.

        :param name: label of this handler, returned by ``__str__``
        :param number_of_samples: number of samples to generate; if ``None``, a random integer from the half-open
            interval [100, 150) is drawn
        """
        super().__init__()
        self.name = name
        if number_of_samples is None:
            number_of_samples = np.random.randint(100, 150)
        self.number_of_samples = number_of_samples
        # inputs are generated before targets, so X consumes the global numpy random state first
        self._X = self.create_X()
        self._Y = self.create_Y()

    def create_X(self):
        """Create inputs: random integers in [0, 10) with shape (no_samples, window=14, variables=5)."""
        n_window, n_variables = 14, 5
        values = np.random.randint(0, 10, size=(self.number_of_samples, n_window, n_variables))
        # hourly time stamps, one per sample, starting at midnight of the current day
        start = dt.datetime.today().date()
        timestamps = pd.date_range(start, periods=self.number_of_samples, freq="H").tolist()
        coords = {"datetime": timestamps, "window": range(n_window), "variables": range(n_variables)}
        return xr.DataArray(values, dims=['datetime', 'window', 'variables'], coords=coords)

    def create_Y(self):
        """Create targets: normally distributed numbers scaled by 0.5 and rounded to one decimal.

        The resulting array has the shape (no_samples, window=5, variables=1).
        """
        n_window, n_variables = 5, 1
        noise = np.random.randn(self.number_of_samples, n_window, n_variables)
        values = np.round(0.5 * noise, 1)
        # hourly time stamps, one per sample, starting at midnight of the current day
        start = dt.datetime.today().date()
        timestamps = pd.date_range(start, periods=self.number_of_samples, freq="H").tolist()
        coords = {"datetime": timestamps, "window": range(n_window), "variables": range(n_variables)}
        return xr.DataArray(values, dims=['datetime', 'window', 'variables'], coords=coords)

    def get_X(self, upsampling=False, as_numpy=False):
        """Return the inputs, as a numpy copy if ``as_numpy`` is ``True``; ``upsampling`` is ignored."""
        if as_numpy is True:
            return np.copy(self._X)
        return self._X

    def get_Y(self, upsampling=False, as_numpy=False):
        """Return the targets, as a numpy copy if ``as_numpy`` is ``True``; ``upsampling`` is ignored."""
        if as_numpy is True:
            return np.copy(self._Y)
        return self._Y

    def __str__(self):
        """Return the name given on construction."""
        return self.name
```
# Special Remarks
## Special instructions for installation on Jülich HPC systems
......
......@@ -305,15 +305,15 @@ class DefaultDataHandler(AbstractDataHandler):
def run_data_prep():
from .data_preparation_neighbors import DataHandlerNeighbors
data = DummyDataSingleStation("main_class")
data = DummyDataHandler("main_class")
data.get_X()
data.get_Y()
path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata")
data_prep = DataHandlerNeighbors(DummyDataSingleStation("main_class"),
data_prep = DataHandlerNeighbors(DummyDataHandler("main_class"),
path,
neighbors=[DummyDataSingleStation("neighbor1"),
DummyDataSingleStation("neighbor2")],
neighbors=[DummyDataHandler("neighbor1"),
DummyDataHandler("neighbor2")],
extreme_values=[1., 1.2])
data_prep.get_data(upsampling=False)
......@@ -344,6 +344,45 @@ def create_data_prep():
return data_prep
class DummyDataHandler(AbstractDataHandler):
    """Minimal example data handler that serves randomly generated inputs and targets."""

    def __init__(self, name, number_of_samples=None):
        """Store the handler's name and generate the random inputs and targets once.

        :param name: label of this handler, returned by ``__str__``
        :param number_of_samples: number of samples to generate; if ``None``, a random integer from the half-open
            interval [100, 150) is drawn
        """
        super().__init__()
        self.name = name
        if number_of_samples is None:
            number_of_samples = np.random.randint(100, 150)
        self.number_of_samples = number_of_samples
        # inputs are generated before targets, so X consumes the global numpy random state first
        self._X = self.create_X()
        self._Y = self.create_Y()

    def create_X(self):
        """Create inputs: random integers in [0, 10) with shape (no_samples, window=14, variables=5)."""
        n_window, n_variables = 14, 5
        values = np.random.randint(0, 10, size=(self.number_of_samples, n_window, n_variables))
        # hourly time stamps, one per sample, starting at midnight of the current day
        start = dt.datetime.today().date()
        timestamps = pd.date_range(start, periods=self.number_of_samples, freq="H").tolist()
        coords = {"datetime": timestamps, "window": range(n_window), "variables": range(n_variables)}
        return xr.DataArray(values, dims=['datetime', 'window', 'variables'], coords=coords)

    def create_Y(self):
        """Create targets: normally distributed numbers scaled by 0.5 and rounded to one decimal.

        The resulting array has the shape (no_samples, window=5, variables=1).
        """
        n_window, n_variables = 5, 1
        noise = np.random.randn(self.number_of_samples, n_window, n_variables)
        values = np.round(0.5 * noise, 1)
        # hourly time stamps, one per sample, starting at midnight of the current day
        start = dt.datetime.today().date()
        timestamps = pd.date_range(start, periods=self.number_of_samples, freq="H").tolist()
        coords = {"datetime": timestamps, "window": range(n_window), "variables": range(n_variables)}
        return xr.DataArray(values, dims=['datetime', 'window', 'variables'], coords=coords)

    def get_X(self, upsampling=False, as_numpy=False):
        """Return the inputs, as a numpy copy if ``as_numpy`` is ``True``; ``upsampling`` is ignored."""
        if as_numpy is True:
            return np.copy(self._X)
        return self._X

    def get_Y(self, upsampling=False, as_numpy=False):
        """Return the targets, as a numpy copy if ``as_numpy`` is ``True``; ``upsampling`` is ignored."""
        if as_numpy is True:
            return np.copy(self._Y)
        return self._Y

    def __str__(self):
        """Return the name given on construction."""
        return self.name
if __name__ == "__main__":
from mlair.data_handler.station_preparation import DataHandlerSingleStation
from mlair.data_handler.iterator import KerasIterator, DataCollection
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment