1#!/usr/bin/env cwl-runner
2### Pipeline to ingest Monthly Pollution data downloaded from WashU Box
3
4# Copyright (c) 2021-2022. Harvard University
5#
6# Developed by Research Software Engineering,
7# Faculty of Arts and Sciences, Research Computing (FAS RC)
8# Author: Michael A Bouzinier
9#
10# Licensed under the Apache License, Version 2.0 (the "License");
11# you may not use this file except in compliance with the License.
12# You may obtain a copy of the License at
13#
14# http://www.apache.org/licenses/LICENSE-2.0
15#
16# Unless required by applicable law or agreed to in writing, software
17# distributed under the License is distributed on an "AS IS" BASIS,
18# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19# See the License for the specific language governing permissions and
20# limitations under the License.
21#
22
# CWL specification version this document is written against.
cwlVersion: v1.2
# This document describes a multi-step workflow.
class: Workflow

# Optional CWL features this workflow relies on.
requirements:
  # Needed whenever a step's `run` target is itself a Workflow.
  SubworkflowFeatureRequirement: {}
  # Allows `valueFrom` on step inputs (used for the constant `domain` value).
  StepInputExpressionRequirement: {}
  # Enables the `$(...)` JavaScript expression used by `make_table_name`.
  InlineJavascriptRequirement: {}
  # The `process` step is scattered over `year`.
  ScatterFeatureRequirement: {}
  # NOTE(review): no step input with several sources is visible in this
  # file; presumably kept for parity with sibling workflows - confirm.
  MultipleInputFeatureRequirement: {}
  # Request outbound network access for the tools run by this workflow.
  NetworkAccess:
    networkAccess: true
34
35
# Human-readable summary of the workflow. Literal block scalar: the line
# breaks below are preserved in the resulting string.
doc: |
  Workflow to aggregate pollution data coming in NetCDF format
  over given geographies (zip codes or counties) and ingest the
  aggregated data into the database
40
# Workflow parameters. Inputs without a `default` (downloads, geography,
# database, connection_name) must be supplied by the caller.
inputs:
  proxy:
    type: string?
    default: ""
    doc: HTTP/HTTPS Proxy if required
  # NOTE(review): `shapes` is not referenced by any step of this workflow.
  shapes:
    type: Directory?
    doc: Do we even need this parameter, as we are instead downloading shapes?
  shape_file_collection:
    type: string
    default: tiger
    doc: |
      [Collection of shapefiles](https://www2.census.gov/geo/tiger),
      either GENZ or TIGER
  downloads:
    type: Directory
    doc: Directory, containing files, downloaded and unpacked from WUSTL box
  geography:
    type: string
    doc: |
      Type of geography: zip codes or counties
      Valid values: "zip" or "county"
  years:
    type: int[]
    default: [
      2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
      2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018
    ]
    doc: |
      Years to process. The `process` step is scattered over this
      list, i.e. it is run once for every listed year
  months:
    type: int[]
    default: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    doc: |
      Month numbers to process; the same list is passed to every
      per-year run of the `process` step
  band:
    type: string
    default: pm25
    doc: |
      Name of the variable (band) to process; it is also used as
      the prefix of the name of the target database table
  strategy:
    type: string
    default: downscale
    doc: "Rasterization strategy"
  ram:
    type: string
    default: 2GB
    doc: Runtime memory, available to the process
  database:
    type: File
    doc: Path to database connection file, usually database.ini
  connection_name:
    type: string
    doc: The name of the section in the database.ini file
86
# Execution order is driven by data dependencies:
#   initdb -> init_tables -> process (one run per year) -> index -> vacuum
# `make_table_name` has no upstream step and feeds the table name to the
# four steps that touch the table.
# The `depends_on` inputs are wired to the log of the preceding step;
# presumably they exist only to force that ordering - confirm against the
# sub-workflow definitions.
steps:
  initdb:
    doc: Ensure that database utilities are at their latest version
    run: initdb.cwl
    in:
      database: database
      connection_name: connection_name
    out: [log, err]

  make_table_name:
    doc: Given variable and geography type (zip/county) evaluates table name
    in:
      band: band
      geography: geography
    out:
      - table
    # Inline ExpressionTool: builds "<band>_monthly_<geography>_mean".
    run:
      class: ExpressionTool
      inputs:
        band:
          type: string
        geography:
          type: string
      outputs:
        table:
          type: string
      expression: "$({'table': (inputs.band + '_monthly_' + inputs.geography + '_mean')})"

  init_tables:
    doc: creates or recreates database tables, one for each band and geography
    run: reset.cwl
    in:
      depends_on: initdb/log
      database: database
      connection_name: connection_name
      # Constant value, supplied through a step-input expression.
      domain:
        valueFrom: 'exposures'
      table: make_table_name/table
    out: [log, errors]

  process:
    # NOTE(review): the `downloads` input is documented as already holding
    # the files fetched from WUSTL Box, so "Downloads" in the doc string
    # below may refer to something else (e.g. shape files) - confirm.
    doc: Downloads raw data and aggregates it over shapes and time
    run: wustl_one_year.cwl
    # One run of wustl_one_year.cwl per element of `years`.
    scatter: [year]
    in:
      depends_on: init_tables/log
      proxy: proxy
      downloads: downloads
      shape_file_collection: shape_file_collection
      geography: geography
      band: band
      strategy: strategy
      year: years
      months: months
      ram: ram
      database: database
      connection_name: connection_name
      table: make_table_name/table
    out: [aggregate_data, aggregate_log, aggregate_err, ingest_log, ingest_err]

  # Runs index.cwl on the table once the ingest logs of all scattered
  # `process` runs are available.
  index:
    run: index.cwl
    in:
      depends_on: process/ingest_log
      database: database
      connection_name: connection_name
      domain:
        valueFrom: 'exposures'
      table: make_table_name/table
    out:
      - log
      - errors

  # Runs vacuum.cwl on the table after `index` has produced its log.
  vacuum:
    run: vacuum.cwl
    in:
      depends_on: index/log
      database: database
      connection_name: connection_name
      domain:
        valueFrom: 'exposures'
      table: make_table_name/table
    out:
      - log
      - errors
177
178
# Workflow outputs.
# Everything sourced from the scattered `process` step is a nested array:
# the outer level has one element per scattered year, the inner level is
# the array produced by wustl_one_year.cwl for that year.
outputs:
  # --- per-year results of the `process` step ---
  data:
    outputSource: process/aggregate_data
    type:
      type: array
      items:
        type: array
        items:
          - File

  aggregate_log:
    outputSource: process/aggregate_log
    type:
      type: array
      items:
        type: array
        items:
          - File
  aggregate_err:
    outputSource: process/aggregate_err
    type:
      type: array
      items:
        type: array
        items:
          - File

  ingest_log:
    outputSource: process/ingest_log
    type:
      type: array
      items:
        type: array
        items:
          - File
  ingest_err:
    outputSource: process/ingest_err
    type:
      type: array
      items:
        type: array
        items:
          - File

  # --- logs of the `init_tables` step (reset.cwl) ---
  reset_log:
    outputSource: init_tables/log
    type: File
  reset_err:
    outputSource: init_tables/errors
    type: File

  # --- logs of the `index` step ---
  index_log:
    outputSource: index/log
    type: File
  index_err:
    outputSource: index/errors
    type: File

  # --- logs of the `vacuum` step ---
  vacuum_log:
    outputSource: vacuum/log
    type: File
  vacuum_err:
    outputSource: vacuum/errors
    type: File