toar / MLAir

Commit 0e8e07f6, authored Nov 23, 2020 by lukas leufen

Merge branch 'lukas_issue164_feat_parallel-station-check' into 'develop'

Resolve "Parallel station check"

See merge request !195

Parents: 60048b9d, f74de303
Pipeline #52947 passed with stages in 12 minutes and 46 seconds
Changes: 5 changed files with 163 additions and 1 deletion (+163 -1)

  .gitignore                                         +7   -0
  mlair/run_modules/pre_processing.py                +61  -0
  run_hourly.py                                      +11  -0
  supplement/station_list_north_german_plain.json    +81  -0
  test/test_run_modules/test_pre_processing.py       +3   -1
.gitignore

@@ -42,6 +42,13 @@ ehthumbs.db
Thumbs.db
.idea/
/venv/
/venv*/
/build/

# ignore HPC related scripts #
##############################
run_*_develgpus.bash
run_*_gpus.bash

# don't check data and plot folder #
####################################
mlair/run_modules/pre_processing.py

@@ -6,6 +6,8 @@ __date__ = '2019-11-25'

import logging
import os
from typing import Tuple
import multiprocessing

import requests
import numpy as np
import pandas as pd
@@ -201,6 +203,50 @@ class PreProcessing(RunEnvironment):

        Valid means, that there is data available for the given time range (is included in `kwargs`). The shape and the
        loading time are logged in debug mode.

        :return: Corrected list containing only valid station IDs.
        """
        t_outer = TimeTracking()
        logging.info(f"check valid stations started{' (%s)' % (set_name if set_name is not None else 'all')}")
        # calculate transformation using train data
        if set_name == "train":
            logging.info("setup transformation using train data exclusively")
            self.transformation(data_handler, set_stations)
        # start station check
        collection = DataCollection()
        valid_stations = []
        kwargs = self.data_store.create_args_dict(data_handler.requirements(), scope=set_name)
        if multiprocessing.cpu_count() > 1:
            # parallel solution
            logging.info("use parallel validate station approach")
            pool = multiprocessing.Pool()
            output = [pool.apply_async(f_proc, args=(data_handler, station, set_name, store_processed_data),
                                       kwds=kwargs)
                      for station in set_stations]
            for p in output:
                dh, s = p.get()
                if dh is not None:
                    collection.add(dh)
                    valid_stations.append(s)
        else:
            # serial solution
            logging.info("use serial validate station approach")
            for station in set_stations:
                dh, s = f_proc(data_handler, station, set_name, store_processed_data, **kwargs)
                if dh is not None:
                    collection.add(dh)
                    valid_stations.append(s)
        logging.info(f"run for {t_outer} to check {len(set_stations)} station(s). Found {len(collection)}/"
                     f"{len(set_stations)} valid stations.")
        return collection, valid_stations

    def validate_station_old(self, data_handler: AbstractDataHandler, set_stations, set_name=None,
                             store_processed_data=True):
        """
        Check if all given stations in `all_stations` are valid.

        Valid means, that there is data available for the given time range (is included in `kwargs`). The shape and the
        loading time are logged in debug mode.

        :return: Corrected list containing only valid station IDs.
        """
        t_outer = TimeTracking()
        ...
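The heart of the change is this parallel/serial branch: each station check is dispatched to a worker pool via apply_async, which returns immediately, and the subsequent p.get() loop is what actually blocks until every worker has finished. Because get() is called on the handles in submission order, valid_stations keeps the order of set_stations. Below is a minimal, standalone sketch of the same fan-out/collect pattern; check_item and items are illustrative stand-ins, not MLAir API:

import multiprocessing


def check_item(item):
    # stand-in for f_proc: return a payload or None if the item is invalid
    return (item * 2, item) if item % 2 == 0 else (None, item)


if __name__ == "__main__":
    items = range(10)
    if multiprocessing.cpu_count() > 1:
        # parallel branch: fan out, then block on each handle in submission order
        pool = multiprocessing.Pool()
        output = [pool.apply_async(check_item, args=(i,)) for i in items]
        results = [p.get() for p in output]
        pool.close()
        pool.join()
    else:
        # serial fallback, same call signature
        results = [check_item(i) for i in items]
    valid = [item for payload, item in results if payload is not None]
    print(valid)  # -> [0, 2, 4, 6, 8]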
@@ -231,3 +277,18 @@ class PreProcessing(RunEnvironment):
        transformation_dict = data_handler.transformation(stations, **kwargs)
        if transformation_dict is not None:
            self.data_store.set("transformation", transformation_dict)


def f_proc(data_handler, station, name_affix, store, **kwargs):
    """
    Try to create a data handler for given arguments. If build fails, this station does not fulfil all requirements
    and therefore f_proc will return None as indication. On a successful build, f_proc returns the built data handler
    and the station that was used. This function must be implemented globally to work together with multiprocessing.
    """
    try:
        res = data_handler.build(station, name_affix=name_affix, store_processed_data=store, **kwargs)
    except (AttributeError, EmptyQueryResult, KeyError, requests.ConnectionError) as e:
        logging.info(f"remove station {station} because it raised an error: {e}")
        res = None
    return res, station
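The docstring's requirement that f_proc be "implemented globally" comes from pickling: multiprocessing ships the callable to worker processes by its fully qualified module-level name, so closures and nested functions cannot be submitted to a Pool. A quick stdlib demonstration of the constraint (not MLAir code):

import pickle


def top_level():
    return "ok"


def make_nested():
    def nested():
        return "ok"
    return nested


pickle.dumps(top_level)          # fine: serialized by reference as <module>.top_level
try:
    pickle.dumps(make_nested())  # fails: a local function has no importable name
except (pickle.PicklingError, AttributeError) as e:
    print(f"cannot send this to a worker: {e}")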
run_hourly.py

@@ -6,6 +6,17 @@ import argparse

from mlair.workflows import DefaultWorkflow


def load_stations():
    import json
    try:
        filename = 'supplement/station_list_north_german_plain.json'
        with open(filename, 'r') as jfile:
            stations = json.load(jfile)
    except FileNotFoundError:
        stations = None
    return stations


def main(parser_args):
    workflow = DefaultWorkflow(sampling="hourly", window_history_size=48, **parser_args.__dict__)
    ...
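load_stations returns None rather than raising when the JSON file is absent, presumably so the caller can fall back to a default station set (what main actually does with the list is in the elided part above). A hedged usage sketch of that None contract; DEFAULT_STATIONS is an illustrative value, not part of the script:

# hypothetical fallback built on load_stations' None contract
DEFAULT_STATIONS = ["DEBW107", "DEBY081"]

stations = load_stations() or DEFAULT_STATIONS
print(f"running on {len(stations)} station(s)")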
supplement/station_list_north_german_plain.json (new file, 0 → 100644)

["DENI031", "DESH016", "DEBB050", "DEHH022", "DEHH049", "DEHH021", "DEMV007", "DESH015",
 "DEBE062", "DEHH012", "DESH004", "DENI062", "DEBE051", "DEHH011", "DEHH023", "DEUB020",
 "DESH005", "DEBB039", "DEHH050", "DENI029", "DESH001", "DEBE001", "DEHH030", "DEHH018",
 "DEUB022", "DEBB038", "DEBB053", "DEMV017", "DENI063", "DENI058", "DESH014", "DEUB007",
 "DEUB005", "DEBB051", "DEUB034", "DEST089", "DEHH005", "DESH003", "DEUB028", "DESH017",
 "DEUB030", "DEMV012", "DENI052", "DENI059", "DENI060", "DESH013", "DEUB006", "DEMV018",
 "DEUB027", "DEUB026", "DEUB038", "DEMV001", "DEUB024", "DEUB037", "DESH008", "DEMV004",
 "DEUB040", "DEMV024", "DEMV026", "DESH056", "DEHH063", "DEUB001", "DEST069", "DEBB040",
 "DEBB028", "DEBB048", "DEBB063", "DEBB067", "DESH006", "DEBE008", "DESH012", "DEHH004",
 "DEBE009", "DEHH007", "DEBE005", "DEHH057", "DEHH047", "DEBE006", "DEBB110"]
test/test_run_modules/test_pre_processing.py

 import logging

 import pytest
+import mock

 from mlair.data_handler import DefaultDataHandler, DataCollection, AbstractDataHandler
 from mlair.helpers.datastore import NameNotFoundInScope
...
@@ -34,7 +35,8 @@ class TestPreProcessing:
         yield pre
         RunEnvironment().__del__()

-    def test_init(self, caplog):
+    @mock.patch("multiprocessing.cpu_count", return_value=1)
+    def test_init(self, mock_cpu, caplog):
         ExperimentSetup(stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'],
                         statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'})
         caplog.clear()
...
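Pinning cpu_count to 1 forces validate_station down the deterministic serial branch, so the test's log assertions are not subject to worker scheduling. The same technique in isolation, as a sketch independent of MLAir:

import multiprocessing
import mock  # the test file uses the standalone mock package; unittest.mock works too


def pick_strategy():
    # same branch validate_station takes
    return "parallel" if multiprocessing.cpu_count() > 1 else "serial"


@mock.patch("multiprocessing.cpu_count", return_value=1)
def test_pick_strategy_serial(mock_cpu):
    assert pick_strategy() == "serial"
    assert mock_cpu.called  # the code under test really consulted cpu_count

mock.patch injects the patched object as an extra positional argument, which is why the new signature in the diff reads (self, mock_cpu, caplog).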