gridmet.cwl

#!/usr/bin/env cwl-runner
### Pipeline to aggregate data from Climatology Lab
#  Copyright (c) 2021-2022. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Michael A Bouzinier
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

cwlVersion: v1.2
class: Workflow

requirements:
  SubworkflowFeatureRequirement: {}
  StepInputExpressionRequirement: {}
  InlineJavascriptRequirement: {}
  ScatterFeatureRequirement: {}
  MultipleInputFeatureRequirement: {}
  NetworkAccess:
    networkAccess: true


doc: |
  This workflow downloads NetCDF datasets from the
  [University of Idaho Gridded Surface Meteorological Dataset](https://www.northwestknowledge.net/metdata/data/),
  aggregates the gridded data to daily mean values over the chosen geographies,
  and optionally ingests the result into a database.

  The output of the workflow is a set of gzipped CSV files containing
  the aggregated data.

  Optionally, the aggregated data can be ingested into a database
  specified by the connection parameters:

  * `database.ini`: a file containing connection descriptions
  * `connection_name`: a string referring to a section in the `database.ini`
    file, identifying the specific connection to be used.
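
  For illustration, a connection section in `database.ini` holds standard
  PostgreSQL connection parameters. A minimal sketch (the exact set of keys
  is defined by the platform configuration; all values are placeholders):

      [dorieh]
      host=localhost
      port=5432
      database=nsaph
      user=postgres
      password=*****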

  The workflow can be invoked either by providing command line options,
  as in the following example:

      toil-cwl-runner --retryCount 1 --cleanWorkDir never \
          --outdir /scratch/work/exposures/outputs \
          --workDir /scratch/work/exposures \
          gridmet.cwl \
          --database /opt/local/database.ini \
          --connection_name dorieh \
          --bands rmin rmax \
          --strategy auto \
          --geography zcta \
          --ram 8GB

  Or, by providing a YAML file (see [example](jobs/test_gridmet_job))
  with similar options:

      toil-cwl-runner --retryCount 1 --cleanWorkDir never \
          --outdir /scratch/work/exposures/outputs \
          --workDir /scratch/work/exposures \
          gridmet.cwl test_gridmet_job.yml
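
  A job file simply maps workflow input names to values. A minimal sketch
  of such a file (the values are illustrative; see the linked example for
  an actual job definition):

      database:
        class: File
        path: /opt/local/database.ini
      connection_name: dorieh
      bands:
        - rmin
        - rmax
      strategy: auto
      geography: zcta
      ram: 8GB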


inputs:
  proxy:
    type: string?
    default: ""
    doc: HTTP/HTTPS proxy, if required
  shapes:
    type: Directory?
    doc: Do we even need this parameter, given that we download the shapes instead?
  geography:
    type: string
    doc: |
      Type of geography: zip codes or counties
      Valid values: "zip", "zcta" or "county"
  years:
    type: string[]
    default: ['1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
  bands:
    doc: |
      University of Idaho Gridded Surface Meteorological Dataset
      [bands](https://developers.google.com/earth-engine/datasets/catalog/IDAHO_EPSCOR_GRIDMET#bands)
    type: string[]
    # default: ['bi', 'erc', 'etr', 'fm100', 'fm1000', 'pet', 'pr', 'rmax', 'rmin', 'sph', 'srad', 'th', 'tmmn', 'tmmx', 'vpd', 'vs']
  strategy:
    type: string
    default: auto
    doc: |
      [Rasterization strategy](https://nsaph-data-platform.github.io/nsaph-platform-docs/common/gridmet/doc/strategy.html)
      used for spatial aggregation
  ram:
    type: string
    default: 2GB
    doc: |
      Runtime memory available to the process. When the aggregation
      strategy is `auto`, this value is used to calculate the optimal
      downscaling factor for the available resources.
  database:
    type: File
    doc: Path to the database connection file, usually database.ini
  connection_name:
    type: string
    doc: The name of the section in the database.ini file
  dates:
    type: string?
    doc: Dates restriction, for testing purposes only
  domain:
    type: string
    default: climate


steps:
  initdb:
    run: initdb.cwl
    doc: Ensures that the database utilities are at their latest version
    in:
      database: database
      connection_name: connection_name
    out:
      - log
      - err

  init_db_schema:
    doc: We need to create the schema upfront because the tables are created in parallel
    run:
      class: CommandLineTool
      baseCommand: [python, -m, dorieh.platform.util.psql]
      doc: |
        This tool executes an SQL statement in the database, creating
        the target schema if it does not yet exist
      inputs:
        database:
          type: File
          doc: Path to the database connection file, usually database.ini
          inputBinding:
            prefix: --db
        connection_name:
          type: string
          doc: The name of the section in the database.ini file
          inputBinding:
            prefix: --connection
        domain:
          type: string
          #default: climate
      arguments:
        - valueFrom: $("CREATE SCHEMA IF NOT EXISTS " + inputs.domain + ';')
          position: 3
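          # e.g., with domain == "climate" this argument evaluates to
          # the SQL statement: CREATE SCHEMA IF NOT EXISTS climate;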
      outputs:
        log:
          type: stdout
        err:
          type: stderr
      stderr: "schema.err"
      stdout: "schema.log"
    in:
      database: database
      connection_name: connection_name
      domain: domain
    out:
      - log
      - err

  make_registry:
    run: build_gridmet_model.cwl
    doc: Writes a YAML file with the database model
    in:
      depends_on: init_db_schema/log
      domain: domain
    out:
      - model
      - log
      - errors

  init_tables:
    doc: Creates or recreates database tables, one for each band
    scatter:
      - band
    run:
      class: Workflow
      inputs:
        registry:
          type: File
        table:
          type: string
        domain:
          type: string
        database:
          type: File
        connection_name:
          type: string
      steps:
        reset:
          run: reset.cwl
          in:
            registry: registry
            domain: domain
            database: database
            connection_name: connection_name
            table: table
          out:
            - log
            - errors
        index:
          run: index.cwl
          in:
            depends_on: reset/log
            registry: registry
            domain: domain
            table: table
            database: database
            connection_name: connection_name
          out: [log, errors]
      outputs:
        reset_log:
          type: File
          outputSource: reset/log
        reset_err:
          type: File
          outputSource: reset/errors
        index_log:
          type: File
          outputSource: index/log
        index_err:
          type: File
          outputSource: index/errors
    in:
      registry: make_registry/model
      database: database
      connection_name: connection_name
      band: bands
      geography: geography
      domain: domain
      table:
        valueFrom: $(inputs.geography + '_' + inputs.band)
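        # e.g., geography == "zcta" and band == "rmax" produce table "zcta_rmax"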
    out:
      - reset_log
      - reset_err
      - index_log
      - index_err

  process:
    run: gridmet_one_file.cwl
    doc: Downloads raw data and aggregates it over shapes and time
    scatter:
      - band
      - year
    scatterMethod: nested_crossproduct
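    # with nested_crossproduct, scattered outputs are nested arrays indexed as [band][year]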

    in:
      proxy: proxy
      depends_on: init_tables/index_log
      model: make_registry/model
      shapes: shapes
      geography: geography
      strategy: strategy
      ram: ram
      year: years
      dates: dates
      band: bands
      database: database
      connection_name: connection_name
      domain: domain
      months:
        valueFrom: $([1,2,3,4,5,6,7,8,9,10,11,12])
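        # pass all 12 months, so each scattered job covers a full year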
      table:
        valueFrom: $(inputs.geography + '_' + inputs.band)
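        # e.g., "zcta_rmax"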

    out:
      - download_log
      - download_err
      - add_data_aggregate_errors
      - add_data_data
      - add_data_aggregate_log
      - add_data_ingest_log
      - add_data_ingest_errors
      - vacuum_log
      - vacuum_err

  export:
    run: export.cwl
    scatter:
      - band
    in:
      depends_on: process/vacuum_log
      database: database
      connection_name: connection_name
      format:
        valueFrom: "parquet"
      domain: domain
      geography: geography
      band: bands
      table:
        valueFrom: $(inputs.domain + '.' + inputs.geography + '_' + inputs.band)
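        # e.g., "climate.zcta_rmax"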
      partition:
        valueFrom: $(["year"])
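        # partition the exported Parquet dataset by year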
      output:
        valueFrom: $('export/' + inputs.domain + '/' + inputs.geography + '_' + inputs.band)
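        # e.g., "export/climate/zcta_rmax"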
    out:
      - data
      - log
      - errors


outputs:
  initdb_log:
    type: File?
    outputSource: initdb/log
  initdb_err:
    type: File?
    outputSource: initdb/err

  init_schema_log:
    type: File?
    outputSource: init_db_schema/log
  init_schema_err:
    type: File?
    outputSource: init_db_schema/err

  registry:
    type: File?
    outputSource: make_registry/model
  registry_log:
    type: File?
    outputSource: make_registry/log
  registry_err:
    type: File?
    outputSource: make_registry/errors

  data:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_data
  download_log:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/download_log
  download_err:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/download_err

  process_log:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_aggregate_log
  process_err:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_aggregate_errors

  ingest_log:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_ingest_log
  ingest_err:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_ingest_errors

  reset_log:
    type:
      type: array
      items: [File]
    outputSource: init_tables/reset_log
  reset_err:
    type:
      type: array
      items: [File]
    outputSource: init_tables/reset_err

  index_log:
    type:
      type: array
      items: [File]
    outputSource: init_tables/index_log
  index_err:
    type:
      type: array
      items: [File]
    outputSource: init_tables/index_err

  vacuum_log:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/vacuum_log
  vacuum_err:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/vacuum_err

  export_data:
    type:
      type: array
      items: ['File', 'Directory']
    outputSource: export/data
  export_log:
    type: File[]
    outputSource: export/log
  export_err:
    type: File[]
    outputSource: export/errors