#!/usr/bin/env cwl-runner
# wustl.cwl
### Pipeline to ingest Monthly Pollution data downloaded from WashU Box

#  Copyright (c) 2021-2022. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Michael A Bouzinier
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

cwlVersion: v1.2
class: Workflow

# Feature flags used by this workflow:
#   - StepInputExpressionRequirement: step inputs below use `valueFrom`
#   - InlineJavascriptRequirement: `make_table_name` evaluates a JS expression
#   - ScatterFeatureRequirement: the `process` step is scattered over `year`
requirements:
  SubworkflowFeatureRequirement: {}
  StepInputExpressionRequirement: {}
  InlineJavascriptRequirement: {}
  ScatterFeatureRequirement: {}
  MultipleInputFeatureRequirement: {}
  NetworkAccess:
    networkAccess: true


doc: |
  Workflow to aggregate pollution data coming in NetCDF format
  over given geographies (zip codes or counties) and ingest the
  aggregated data into the database

inputs:
  proxy:
    type: string?
    default: ""
    doc: HTTP/HTTPS Proxy if required
  # NOTE(review): `shapes` is not wired to any step in this workflow;
  # it is kept so that existing job files that supply it remain valid.
  shapes:
    type: Directory?
    doc: Do we even need this parameter, as we are instead downloading shapes?
  shape_file_collection:
    type: string
    default: tiger
    doc: |
      [Collection of shapefiles](https://www2.census.gov/geo/tiger),
      either GENZ or TIGER
  downloads:
    type: Directory
    doc: Directory, containing files, downloaded and unpacked from WUSTL box
  geography:
    type: string
    doc: |
      Type of geography: zip codes or counties
      Valid values: "zip" or "county"
  years:
    type: int[]
    default: [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
              2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    doc: Years to process; the `process` step is scattered over this list
  months:
    type: int[]
    default: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    doc: Months to process, passed unchanged to every per-year run
  band:
    type: string
    default: pm25
    doc: Name of the variable (band) to aggregate, also prefixes the table name
  strategy:
    type: string
    default: downscale
    doc: "Rasterization strategy"
  ram:
    type: string
    default: 2GB
    doc: Runtime memory, available to the process
  database:
    type: File
    doc: Path to database connection file, usually database.ini
  connection_name:
    type: string
    doc: The name of the section in the database.ini file

# Step order is enforced through the `depends_on` inputs:
#   initdb -> init_tables -> process (one run per year) -> index -> vacuum
# NOTE(review): `depends_on` is presumably consumed by the tools only for
# sequencing; confirm in the referenced *.cwl files.
steps:
  initdb:
    run: initdb.cwl
    doc: Ensure that database utilities are at their latest version
    in:
      database: database
      connection_name: connection_name
    out:
      - log
      - err

  make_table_name:
    doc: Given variable and geography type (zip/county) evaluates table name
    run:
      class: ExpressionTool
      inputs:
        geography:
          type: string
        band:
          type: string
      # Produces "<band>_monthly_<geography>_mean"
      expression: "$({'table': (inputs.band + '_monthly_' + inputs.geography + '_mean')})"
      outputs:
        table:
          type: string
    in:
      geography: geography
      band: band
    out: [table]

  init_tables:
    doc: creates or recreates database tables, one for each band and geography
    run: reset.cwl
    in:
      domain:
        valueFrom: "exposures"
      database: database
      connection_name: connection_name
      table: make_table_name/table
      # Wait for the database utilities to be initialized first
      depends_on: initdb/log
    out:
      - log
      - errors

  process:
    doc: Downloads raw data and aggregates it over shapes and time
    # One invocation of wustl_one_year.cwl per element of `years`
    scatter:
      - year
    run: wustl_one_year.cwl
    in:
      proxy: proxy
      # Wait until the target table has been (re)created
      depends_on: init_tables/log
      downloads: downloads
      geography: geography
      year: years
      months: months
      band: band
      strategy: strategy
      ram: ram
      database: database
      connection_name: connection_name
      table: make_table_name/table
      shape_file_collection: shape_file_collection
    out:
      - aggregate_data
      - aggregate_log
      - aggregate_err
      - ingest_log
      - ingest_err

  # Runs index.cwl on the target table once the ingestion logs of all years
  # are available (presumably builds table indices; confirm in index.cwl).
  index:
    run: index.cwl
    in:
      depends_on: process/ingest_log
      domain:
        valueFrom: "exposures"
      table: make_table_name/table
      database: database
      connection_name: connection_name
    out: [log, errors]

  # Runs vacuum.cwl on the target table after the `index` step has finished
  # (presumably a database VACUUM; confirm in vacuum.cwl).
  vacuum:
    run: vacuum.cwl
    in:
      depends_on: index/log
      domain:
        valueFrom: "exposures"
      table: make_table_name/table
      database: database
      connection_name: connection_name
    out: [log, errors]


# Outputs sourced from the scattered `process` step are arrays of arrays:
# the outer level has one entry per year; the inner level is whatever array
# wustl_one_year.cwl emits for that year.
outputs:
  data:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/aggregate_data

  aggregate_log:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/aggregate_log
  aggregate_err:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/aggregate_err

  ingest_log:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/ingest_log
  ingest_err:
    type:
      type: array
      items:
        type: array
        items: [File]
    outputSource: process/ingest_err

  reset_log:
    type: File
    outputSource: init_tables/log
  reset_err:
    type: File
    outputSource: init_tables/errors

  index_log:
    type: File
    outputSource: index/log
  index_err:
    type: File
    outputSource: index/errors

  vacuum_log:
    type: File
    outputSource: vacuum/log
  vacuum_err:
    type: File
    outputSource: vacuum/errors