gridmet.cwl

#!/usr/bin/env cwl-runner
### Pipeline to aggregate data from Climatology Lab
#  Copyright (c) 2021-2022. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Michael A Bouzinier
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

cwlVersion: v1.2
class: Workflow

requirements:
  SubworkflowFeatureRequirement: {}
  StepInputExpressionRequirement: {}
  InlineJavascriptRequirement: {}
  ScatterFeatureRequirement: {}
  MultipleInputFeatureRequirement: {}
  NetworkAccess:
    networkAccess: true


doc: |
  This workflow downloads NetCDF datasets from the
  [University of Idaho Gridded Surface Meteorological Dataset](https://www.northwestknowledge.net/metdata/data/),
  aggregates the gridded data to daily mean values over the chosen
  geographies and, optionally, ingests the result into a database.

  The outputs of the workflow are gzipped CSV files containing the
  aggregated data.

  Optionally, the aggregated data can be ingested into a database
  specified by the connection parameters:

  * `database.ini`: a file containing connection descriptions
  * `connection_name`: a string referring to a section of the `database.ini`
    file, identifying the specific connection to be used (see the sketch
    following this list).
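
  For illustration only, a hypothetical `database.ini` with a section
  named `dorieh` could look like this (the exact keys depend on your
  database setup; all values below are placeholders):

      [dorieh]
      host=localhost
      port=5432
      database=nsaph
      user=postgres
      password=*****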

  The workflow can be invoked either by providing command line options,
  as in the following example:

      toil-cwl-runner --retryCount 1 --cleanWorkDir never \
          --outdir /scratch/work/exposures/outputs \
          --workDir /scratch/work/exposures \
          gridmet.cwl \
          --database /opt/local/database.ini \
          --connection_name dorieh \
          --bands rmin rmax \
          --strategy auto \
          --geography zcta \
          --ram 8GB

  or by providing a YAML job file (see [example](jobs/test_gridmet_job))
  with the same options:

      toil-cwl-runner --retryCount 1 --cleanWorkDir never \
          --outdir /scratch/work/exposures/outputs \
          --workDir /scratch/work/exposures \
          gridmet.cwl test_gridmet_job.yml
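
  A minimal sketch of such a job file, mirroring the command line
  options above (the bundled example may differ slightly):

      database:
        class: File
        path: /opt/local/database.ini
      connection_name: dorieh
      bands: [rmin, rmax]
      strategy: auto
      geography: zcta
      ram: 8GB
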
inputs:
  proxy:
    type: string?
    default: ""
    doc: HTTP/HTTPS proxy, if required
  shapes:
    type: Directory?
    doc: Do we even need this parameter, given that we download the shapes instead?
  geography:
    type: string
    doc: |
      Type of geography: zip codes or counties
      Valid values: "zip", "zcta" or "county"
  years:
    type: string[]
    default: ['1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
  bands:
    doc: |
      University of Idaho Gridded Surface Meteorological Dataset
      [bands](https://developers.google.com/earth-engine/datasets/catalog/IDAHO_EPSCOR_GRIDMET#bands)
    type: string[]
    # default: ['bi', 'erc', 'etr', 'fm100', 'fm1000', 'pet', 'pr', 'rmax', 'rmin', 'sph', 'srad', 'th', 'tmmn', 'tmmx', 'vpd', 'vs']
  strategy:
    type: string
    default: auto
    doc: |
      [Rasterization strategy](https://nsaph-data-platform.github.io/nsaph-platform-docs/common/gridmet/doc/strategy.html)
      used for spatial aggregation
  ram:
    type: string
    default: 2GB
    doc: |
      Runtime memory available to the process. When the aggregation
      strategy is `auto`, this value is used to calculate the optimal
      downscaling factor for the available resources.
  database:
    type: File
    doc: Path to the database connection file, usually database.ini
  connection_name:
    type: string
    doc: The name of the section in the database.ini file
  dates:
    type: string?
    doc: Dates restriction, for testing purposes only
  domain:
    type: string
    default: climate
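    # The domain is used as the name of the database schema: the
    # init_db_schema step below runs CREATE SCHEMA IF NOT EXISTS <domain>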


steps:
  initdb:
    run: initdb.cwl
    doc: Ensures that database utilities are at their latest version
    in:
      database: database
      connection_name: connection_name
    out:
      - log
      - err

  init_db_schema:
    doc: We need to do this because the tables are created in parallel
    run:
      class: CommandLineTool
      baseCommand: [python, -m, dorieh.platform.util.psql]
      doc: |
        This tool executes an SQL statement in the database, here used
        to create the target schema if it does not already exist
      inputs:
        database:
          type: File
          doc: Path to database connection file, usually database.ini
          inputBinding:
            prefix: --db
        connection_name:
          type: string
          doc: The name of the section in the database.ini file
          inputBinding:
            prefix: --connection
        domain:
          type: string
          #default: climate
      arguments:
        - valueFrom: $("CREATE SCHEMA IF NOT EXISTS " + inputs.domain + ';')
          position: 3
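        # With the default domain ("climate") the rendered statement is:
        #     CREATE SCHEMA IF NOT EXISTS climate;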
      outputs:
        log:
          type: stdout
        err:
          type: stderr
      stderr: "schema.err"
      stdout: "schema.log"
    in:
      database: database
      connection_name: connection_name
      domain: domain
    out:
      - log
      - err

  make_registry:
    run: build_gridmet_model.cwl
    doc: Writes a YAML file with the database model
    in:
      depends_on: init_db_schema/log
      domain: domain
    out:
      - model
      - log
      - errors

  init_tables:
    doc: Creates or recreates database tables, one for each band
    scatter:
      - band
    run:
      class: Workflow
      inputs:
        registry:
          type: File
        table:
          type: string
        domain:
          type: string
        database:
          type: File
        connection_name:
          type: string
      steps:
        reset:
          run: reset.cwl
          in:
            registry: registry
            domain: domain
            database: database
            connection_name: connection_name
            table: table
          out:
            - log
            - errors
        index:
          run: index.cwl
          in:
            depends_on: reset/log
            registry: registry
            domain: domain
            table: table
            database: database
            connection_name: connection_name
          out: [log, errors]
      outputs:
        reset_log:
          type: File
          outputSource: reset/log
        reset_err:
          type: File
          outputSource: reset/errors
        index_log:
          type: File
          outputSource: index/log
        index_err:
          type: File
          outputSource: index/errors
    in:
      registry: make_registry/model
      database: database
      connection_name: connection_name
      band: bands
      geography: geography
      domain: domain
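      # The table name is computed from the step inputs; e.g., for
      # geography "zcta" and band "rmax" the table is named "zcta_rmax"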
      table:
        valueFrom: $(inputs.geography + '_' + inputs.band)
    out:
      - reset_log
      - reset_err
      - index_log
      - index_err

  process:
    run: gridmet_one_file.cwl
    doc: Downloads raw data and aggregates it over shapes and time
    scatter:
      - band
      - year
    scatterMethod: nested_crossproduct
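    # Scattering over both bands and years with nested_crossproduct means
    # the outputs of this step are collected as nested (per-band, per-year)
    # arrays in the workflow outputs below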

    in:
      proxy: proxy
      depends_on: init_tables/index_log
      model: make_registry/model
      shapes: shapes
      geography: geography
      strategy: strategy
      ram: ram
      year: years
      dates: dates
      band: bands
      database: database
      connection_name: connection_name
      domain: domain
      months:
        valueFrom: $([1,2,3,4,5,6,7,8,9,10,11,12])
      table:
        valueFrom: $(inputs.geography + '_' + inputs.band)

    out:
      - download_log
      - download_err
      - add_data_aggregate_errors
      - add_data_data
      - add_data_aggregate_log
      - add_data_ingest_log
      - add_data_ingest_errors
      - vacuum_log
      - vacuum_err

outputs:
  initdb_log:
    type: File?
    outputSource: initdb/log
  initdb_err:
    type: File?
    outputSource: initdb/err

  init_schema_log:
    type: File?
    outputSource: init_db_schema/log
  init_schema_err:
    type: File?
    outputSource: init_db_schema/err

  registry:
    type: File?
    outputSource: make_registry/model
  registry_log:
    type: File?
    outputSource: make_registry/log
  registry_err:
    type: File?
    outputSource: make_registry/errors

  data:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_data
  download_log:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/download_log
  download_err:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/download_err

  process_log:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_aggregate_log
  process_err:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_aggregate_errors

  ingest_log:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_ingest_log
  ingest_err:
    type:
      type: array
      items:
        type: array
        items:
          type: array
          items: [File]
    outputSource: process/add_data_ingest_errors

  reset_log:
    type:
      type: array
      items: [File]
    outputSource: init_tables/reset_log
  reset_err:
    type:
      type: array
      items: [File]
    outputSource: init_tables/reset_err

  index_log:
    type:
      type: array
      items: [File]
    outputSource: init_tables/index_log
  index_err:
    type:
      type: array
      items: [File]
    outputSource: init_tables/index_err

  vacuum_log:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/vacuum_log
  vacuum_err:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/vacuum_err