1#!/usr/bin/env cwl-runner
2### Pipeline to aggregate data from Climatology Lab
3# Copyright (c) 2021-2022. Harvard University
4#
5# Developed by Research Software Engineering,
6# Faculty of Arts and Sciences, Research Computing (FAS RC)
7# Author: Michael A Bouzinier
8#
9# Licensed under the Apache License, Version 2.0 (the "License");
10# you may not use this file except in compliance with the License.
11# You may obtain a copy of the License at
12#
13# http://www.apache.org/licenses/LICENSE-2.0
14#
15# Unless required by applicable law or agreed to in writing, software
16# distributed under the License is distributed on an "AS IS" BASIS,
17# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18# See the License for the specific language governing permissions and
19# limitations under the License.
20#
21
cwlVersion: v1.2
class: Workflow

requirements:
  SubworkflowFeatureRequirement: {}
  StepInputExpressionRequirement: {}
  InlineJavascriptRequirement: {}
  ScatterFeatureRequirement: {}
  MultipleInputFeatureRequirement: {}
  NetworkAccess:
    # Steps download raw NetCDF data over HTTP(S), hence network access.
    # Canonical lowercase YAML 1.2 boolean (was `True`, a YAML 1.1 truthy
    # spelling that strict parsers/yamllint flag).
    networkAccess: true
34
35doc: |
36 This workflow downloads NetCDF datasets from
37 [University of Idaho Gridded Surface Meteorological Dataset](https://www.northwestknowledge.net/metdata/data/),
38 aggregates gridded data to daily mean values over chosen geographies
39 and optionally ingests it into the database.
40
41 The output of the workflow are gzipped CSV files containing
42 aggregated data.
43
44 Optionally, the aggregated data can be ingested into a database
45 specified in the connection parameters:
46
47 * `database.ini` file containing connection descriptions
48 * `connection_name` a string referring to a section in the `database.ini`
49 file, identifying specific connection to be used.
50
51 The workflow can be invoked either by providing command line options
52 as in the following example:
53
54 toil-cwl-runner --retryCount 1 --cleanWorkDir never \
55 --outdir /scratch/work/exposures/outputs \
56 --workDir /scratch/work/exposures \
57 gridmet.cwl \
58 --database /opt/local/database.ini \
59 --connection_name dorieh \
60 --bands rmin rmax \
61 --strategy auto \
62 --geography zcta \
63 --ram 8GB
64
  Or, by providing a YAML file (see [example](jobs/test_gridmet_job))
66 with similar options:
67
68 toil-cwl-runner --retryCount 1 --cleanWorkDir never \
69 --outdir /scratch/work/exposures/outputs \
70 --workDir /scratch/work/exposures \
71 gridmet.cwl test_gridmet_job.yml
72
73
inputs:
  proxy:
    type: string?
    default: ""
    doc: HTTP/HTTPS Proxy if required
  shapes:
    type: Directory?
    doc: Do we even need this parameter, as we are instead downloading shapes?
  geography:
    type: string
    doc: |
      Type of geography: zip codes or counties
      Valid values: "zip", "zcta" or "county"
  years:
    type: string[]
    doc: Calendar years for which to download and aggregate data
    default: ['1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
  bands:
    doc: |
      University of Idaho Gridded Surface Meteorological Dataset
      [bands](https://developers.google.com/earth-engine/datasets/catalog/IDAHO_EPSCOR_GRIDMET#bands)
    type: string[]
    # default: ['bi', 'erc', 'etr', 'fm100', 'fm1000', 'pet', 'pr', 'rmax', 'rmin', 'sph', 'srad', 'th', 'tmmn', 'tmmx', 'vpd', 'vs']
  strategy:
    type: string
    default: auto
    doc: |
      [Rasterization strategy](https://nsaph-data-platform.github.io/nsaph-platform-docs/common/gridmet/doc/strategy.html)
      used for spatial aggregation
  ram:
    type: string
    default: 2GB
    doc: |
      Runtime memory, available to the process. When aggregation
      strategy is `auto`, this value is used to calculate the optimal
      downscaling factor for the available resources.
  database:
    type: File
    doc: Path to database connection file, usually database.ini
  connection_name:
    type: string
    doc: The name of the section in the database.ini file
  dates:
    type: string?
    doc: 'dates restriction, for testing purposes only'
  domain:
    type: string
    default: climate
    doc: Name of the data domain; also the database schema holding the tables
121
122
123steps:
  initdb:
    run: initdb.cwl
    doc: Ensure that database utilities are at their latest version
    # NOTE(review): no other step lists initdb outputs in `depends_on`,
    # so this step may run concurrently with init_db_schema — confirm
    # that is intended.
    in:
      database: database
      connection_name: connection_name
    out:
      - log
      - err
133
134 init_db_schema:
135 doc: We need to do it because of parallel creation of tables
136 run:
137 class: CommandLineTool
138 baseCommand: [python, -m, dorieh.platform.util.psql]
139 doc: |
140 This tool executes an SQL statement in the database to grant
141 read privileges to NSAPH users (memebrs of group nsaph_admin)
142 inputs:
143 database:
144 type: File
145 doc: Path to database connection file, usually database.ini
146 inputBinding:
147 prefix: --db
148 connection_name:
149 type: string
150 doc: The name of the section in the database.ini file
151 inputBinding:
152 prefix: --connection
153 domain:
154 type: string
155 #default: climate
156 arguments:
157 - valueFrom: $("CREATE SCHEMA IF NOT EXISTS " + inputs.domain + ';')
158 position: 3
159 outputs:
160 log:
161 type: stdout
162 err:
163 type: stderr
164 stderr: "schema.err"
165 stdout: "schema.log"
166 in:
167 database: database
168 connection_name: connection_name
169 domain: domain
170 out:
171 - log
172 - err
173
  make_registry:
    run: build_gridmet_model.cwl
    doc: Writes down YAML file with the database model
    in:
      # Sequencing only: wait until the schema has been created
      depends_on: init_db_schema/log
      domain: domain
    out:
      # `model` is the registry file consumed by init_tables and process
      - model
      - log
      - errors
184
  init_tables:
    doc: creates or recreates database tables, one for each band
    # Scattered over bands: the inline subworkflow below runs once per band
    scatter:
      - band
    run:
      class: Workflow
      inputs:
        registry:
          type: File
        table:
          type: string
        domain:
          type: string
        database:
          type: File
        connection_name:
          type: string
      steps:
        # Drop/recreate the table for this band
        reset:
          run: reset.cwl
          in:
            registry: registry
            domain: domain
            database: database
            connection_name: connection_name
            table: table
          out:
            - log
            - errors
        # Build indices after the table has been (re)created
        index:
          run: index.cwl
          in:
            # Sequencing only: index after reset completes
            depends_on: reset/log
            registry: registry
            domain: domain
            table: table
            database: database
            connection_name: connection_name
          out: [log, errors]
      outputs:
        reset_log:
          type: File
          outputSource: reset/log
        reset_err:
          type: File
          outputSource: reset/errors
        index_log:
          type: File
          outputSource: index/log
        index_err:
          type: File
          outputSource: index/errors
    in:
      registry: make_registry/model
      database: database
      connection_name: connection_name
      band: bands
      geography: geography
      domain: domain
      table:
        # Table name derived per scattered band, e.g. "zcta_rmin"
        valueFrom: $(inputs.geography + '_' + inputs.band)
    out:
      - reset_log
      - reset_err
      - index_log
      - index_err
251
  process:
    run: gridmet_one_file.cwl
    doc: Downloads raw data and aggregates it over shapes and time
    # Nested cross-product scatter: one sub-run per (band, year) pair,
    # producing two levels of array nesting in this step's outputs.
    scatter:
      - band
      - year
    scatterMethod: nested_crossproduct

    in:
      proxy: proxy
      # Sequencing only: wait until tables are created and indexed
      depends_on: init_tables/index_log
      model: make_registry/model
      shapes: shapes
      geography: geography
      strategy: strategy
      ram: ram
      year: years
      dates: dates
      band: bands
      database: database
      connection_name: connection_name
      domain: domain
      months:
        # Process all 12 months of each scattered year
        valueFrom: $([1,2,3,4,5,6,7,8,9,10,11,12])
      table:
        # Target table name for this band, e.g. "zcta_rmin"
        valueFrom: $(inputs.geography + '_' + inputs.band)

    out:
      - download_log
      - download_err
      - add_data_aggregate_errors
      - add_data_data
      - add_data_aggregate_log
      - add_data_ingest_log
      - add_data_ingest_errors
      - vacuum_log
      - vacuum_err
289
  export:
    run: export.cwl
    # One export per band, after all (band, year) processing has finished
    scatter:
      - band
    in:
      # Sequencing only: export after ingestion and vacuum complete
      depends_on: process/vacuum_log
      database: database
      connection_name: connection_name
      format:
        valueFrom: "parquet"
      domain: domain
      geography: geography
      band: bands
      table:
        # Fully qualified table name, e.g. "climate.zcta_rmin"
        valueFrom: $(inputs.domain + '.' + inputs.geography + '_' + inputs.band)
      partition:
        # Partition the exported dataset by year
        valueFrom: $(["year"])
      output:
        # Destination path, e.g. "export/climate/zcta_rmin"
        valueFrom: $('export/' + inputs.domain + '/' + inputs.geography + '_' + inputs.band)
    out:
      - data
      - log
      - errors
313
314
315
outputs:
  # Logs from database utility initialization
  initdb_log:
    type: File?
    outputSource: initdb/log
  initdb_err:
    type: File?
    outputSource: initdb/err

  # Logs from schema creation
  init_schema_log:
    type: File?
    outputSource: init_db_schema/log
  init_schema_err:
    type: File?
    outputSource: init_db_schema/err

  # Generated database model and its logs
  registry:
    type: File?
    outputSource: make_registry/model
  registry_log:
    type: File?
    outputSource: make_registry/log
  registry_err:
    type: File?
    outputSource: make_registry/errors

  # Outputs of the `process` step carry the band x year scatter nesting;
  # hence the multi-level array types below.
  data:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_data
  download_log:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/download_log
  download_err:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/download_err

  process_log:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_aggregate_log
  process_err:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_aggregate_errors

  ingest_log:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_ingest_log
  ingest_err:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_ingest_errors

  # init_tables is scattered over bands: one log file per band
  reset_log:
    type:
      type: array
      items: [File]
    outputSource: init_tables/reset_log
  reset_err:
    type:
      type: array
      items: [File]
    outputSource: init_tables/reset_err

  index_log:
    type:
      type: array
      items: [File]
    outputSource: init_tables/index_log
  index_err:
    type:
      type: array
      items: [File]
    outputSource: init_tables/index_err

  vacuum_log:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/vacuum_log
  vacuum_err:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/vacuum_err

  # Exported Parquet data, one entry per band
  export_data:
    type:
      type: array
      items: ['File', 'Directory']
    outputSource: export/data
  export_log:
    type: File[]
    outputSource: export/log
  export_err:
    type: File[]
    outputSource: export/errors