#!/usr/bin/env cwl-runner
### Pipeline to aggregate data from Climatology Lab
#  Copyright (c) 2021-2022. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Michael A Bouzinier
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
cwlVersion: v1.2
class: Workflow

requirements:
  SubworkflowFeatureRequirement: {}
  StepInputExpressionRequirement: {}
  InlineJavascriptRequirement: {}
  ScatterFeatureRequirement: {}
  MultipleInputFeatureRequirement: {}
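  # Outbound network access is required because this workflow downloads
  # the NetCDF files and the shape files itself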
  NetworkAccess:
    networkAccess: true


doc: |
  This workflow downloads NetCDF datasets from the
  [University of Idaho Gridded Surface Meteorological Dataset](https://www.northwestknowledge.net/metdata/data/),
  aggregates the gridded data to daily mean values over the chosen
  geographies, and optionally ingests the results into a database.

  The outputs of the workflow are gzipped CSV files containing the
  aggregated data.

  Optionally, the aggregated data can be ingested into a database
  specified by the connection parameters:

  * `database.ini`: a file containing the connection descriptions
  * `connection_name`: a string referring to a section in the
    `database.ini` file that identifies the specific connection to be used
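
  A `database.ini` file follows the standard INI format, with one
  section per connection. A minimal sketch, where the section name
  matches `connection_name` and all values are illustrative:

      [dorieh]
      host=localhost
      port=5432
      database=nsaph
      user=postgres
      password=secret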

  The workflow can be invoked either by providing command line options
  as in the following example:

      toil-cwl-runner --retryCount 1 --cleanWorkDir never \
          --outdir /scratch/work/exposures/outputs \
          --workDir /scratch/work/exposures \
          gridmet.cwl \
          --database /opt/local/database.ini \
          --connection_name dorieh \
          --bands rmin rmax \
          --strategy auto \
          --geography zcta \
          --ram 8GB

  Or, by providing a YAML file (see [example](jobs/test_gridmet_job))
  with similar options:

      toil-cwl-runner --retryCount 1 --cleanWorkDir never \
          --outdir /scratch/work/exposures/outputs \
          --workDir /scratch/work/exposures \
          gridmet.cwl test_gridmet_job.yml
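
  The job file mirrors the workflow inputs declared below. A minimal
  sketch of such a file (paths and values are illustrative, not the
  actual contents of the linked example):

      database:
        class: File
        path: /opt/local/database.ini
      connection_name: dorieh
      bands:
        - rmin
        - rmax
      strategy: auto
      geography: zcta
      ram: 8GB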


inputs:
  proxy:
    type: string?
    default: ""
    doc: HTTP/HTTPS proxy, if required
  shapes:
    type: Directory?
    doc: Do we even need this parameter, given that we download the shapes instead?
  geography:
    type: string
    doc: |
      Type of geography: zip codes, ZCTAs, or counties.
      Valid values: "zip", "zcta" or "county"
  years:
    type: string[]
    default: ['1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
  bands:
    doc: |
      University of Idaho Gridded Surface Meteorological Dataset
      [bands](https://developers.google.com/earth-engine/datasets/catalog/IDAHO_EPSCOR_GRIDMET#bands)
    type: string[]
    # default: ['bi', 'erc', 'etr', 'fm100', 'fm1000', 'pet', 'pr', 'rmax', 'rmin', 'sph', 'srad', 'th', 'tmmn', 'tmmx', 'vpd', 'vs']
  strategy:
    type: string
    default: auto
    doc: |
      [Rasterization strategy](https://nsaph-data-platform.github.io/nsaph-platform-docs/common/gridmet/doc/strategy.html)
      used for spatial aggregation
  ram:
    type: string
    default: 2GB
    doc: |
      Runtime memory available to the process. When the aggregation
      strategy is `auto`, this value is used to calculate the optimal
      downscaling factor for the available resources.
  database:
    type: File
    doc: Path to the database connection file, usually database.ini
  connection_name:
    type: string
    doc: The name of the section in the database.ini file
  dates:
    type: string?
    doc: Date range restriction, for testing purposes only
  domain:
    type: string
    default: climate

steps:
  initdb:
    run: initdb.cwl
    doc: Ensures that the database utilities are at their latest version
    in:
      database: database
      connection_name: connection_name
    out:
      - log
      - err

  init_db_schema:
    doc: |
      Creates the database schema upfront; this is needed because the
      tables are later created in parallel
    run:
      class: CommandLineTool
      baseCommand: [python, -m, dorieh.platform.util.psql]
      doc: |
        This tool executes an SQL statement in the database to create
        the schema for the given domain if it does not yet exist
      inputs:
        database:
          type: File
          doc: Path to the database connection file, usually database.ini
          inputBinding:
            prefix: --db
        connection_name:
          type: string
          doc: The name of the section in the database.ini file
          inputBinding:
            prefix: --connection
        domain:
          type: string
          # default: climate
      arguments:
        - valueFrom: $("CREATE SCHEMA IF NOT EXISTS " + inputs.domain + ';')
          position: 3
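      # With domain = "climate", this resolves to an invocation like
      # (argument order may vary):
      #   python -m dorieh.platform.util.psql --connection <name> \
      #       --db <database.ini> "CREATE SCHEMA IF NOT EXISTS climate;"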
      outputs:
        log:
          type: stdout
        err:
          type: stderr
      stderr: "schema.err"
      stdout: "schema.log"
    in:
      database: database
      connection_name: connection_name
      domain: domain
    out:
      - log
      - err

  make_registry:
    run: build_gridmet_model.cwl
    doc: Writes a YAML file with the database model
    in:
      depends_on: init_db_schema/log
      domain: domain
    out:
      - model
      - log
      - errors

  init_tables:
    doc: Creates or recreates the database tables, one for each band
    scatter:
      - band
    run:
      class: Workflow
      inputs:
        registry:
          type: File
        table:
          type: string
        domain:
          type: string
        database:
          type: File
        connection_name:
          type: string
      steps:
        reset:
          run: reset.cwl
          in:
            registry: registry
            domain: domain
            database: database
            connection_name: connection_name
            table: table
          out:
            - log
            - errors
        index:
          run: index.cwl
          in:
            depends_on: reset/log
            registry: registry
            domain: domain
            table: table
            database: database
            connection_name: connection_name
          out: [log, errors]
      outputs:
        reset_log:
          type: File
          outputSource: reset/log
        reset_err:
          type: File
          outputSource: reset/errors
        index_log:
          type: File
          outputSource: index/log
        index_err:
          type: File
          outputSource: index/errors
    in:
      registry: make_registry/model
      database: database
      connection_name: connection_name
      band: bands
      geography: geography
      domain: domain
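      # The target table name combines geography and band: e.g.,
      # geography "zcta" with band "rmax" yields the table "zcta_rmax"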
      table:
        valueFrom: $(inputs.geography + '_' + inputs.band)
    out:
      - reset_log
      - reset_err
      - index_log
      - index_err

  process:
    run: gridmet_one_file.cwl
    doc: Downloads raw data and aggregates it over shapes and time
    scatter:
      - band
      - year
    scatterMethod: nested_crossproduct
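    # With nested_crossproduct, this step runs once for every
    # (band, year) combination and its outputs are nested accordingly:
    # arrays over bands of arrays over years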

    in:
      proxy: proxy
      depends_on: init_tables/index_log
      model: make_registry/model
      shapes: shapes
      geography: geography
      strategy: strategy
      ram: ram
      year: years
      dates: dates
      band: bands
      database: database
      connection_name: connection_name
      domain: domain
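      # Every run processes all 12 months of the year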
      months:
        valueFrom: $([1,2,3,4,5,6,7,8,9,10,11,12])
      table:
        valueFrom: $(inputs.geography + '_' + inputs.band)

    out:
      - download_log
      - download_err
      - add_data_aggregate_errors
      - add_data_data
      - add_data_aggregate_log
      - add_data_ingest_log
      - add_data_ingest_errors
      - vacuum_log
      - vacuum_err

outputs:
  initdb_log:
    type: File?
    outputSource: initdb/log
  initdb_err:
    type: File?
    outputSource: initdb/err

  init_schema_log:
    type: File?
    outputSource: init_db_schema/log
  init_schema_err:
    type: File?
    outputSource: init_db_schema/err

  registry:
    type: File?
    outputSource: make_registry/model
  registry_log:
    type: File?
    outputSource: make_registry/log
  registry_err:
    type: File?
    outputSource: make_registry/errors

  data:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_data
  download_log:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/download_log
  download_err:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/download_err

  process_log:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_aggregate_log
  process_err:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_aggregate_errors

  ingest_log:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_ingest_log
  ingest_err:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_ingest_errors

  reset_log:
    type:
      type: array
      items: [File]
    outputSource: init_tables/reset_log
  reset_err:
    type:
      type: array
      items: [File]
    outputSource: init_tables/reset_err

  index_log:
    type:
      type: array
      items: [File]
    outputSource: init_tables/index_log
  index_err:
    type:
      type: array
      items: [File]
    outputSource: init_tables/index_err

  vacuum_log:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/vacuum_log
  vacuum_err:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/vacuum_err