1#!/usr/bin/env cwl-runner
2### Full EPA AQS Processing Pipeline
3# Copyright (c) 2021. Harvard University
4#
5# Developed by Research Software Engineering,
6# Faculty of Arts and Sciences, Research Computing (FAS RC)
7# Author: Michael A Bouzinier
8#
9# Licensed under the Apache License, Version 2.0 (the "License");
10# you may not use this file except in compliance with the License.
11# You may obtain a copy of the License at
12#
13# http://www.apache.org/licenses/LICENSE-2.0
14#
15# Unless required by applicable law or agreed to in writing, software
16# distributed under the License is distributed on an "AS IS" BASIS,
17# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18# See the License for the specific language governing permissions and
19# limitations under the License.
20#
21
cwlVersion: v1.2
class: Workflow

# Optional CWL features this workflow relies on. Written in the
# list-of-classes form, which CWL treats the same as the mapping form.
requirements:
  # Every step delegates to a separate CWL document through `run:`.
  # NOTE(review): strictly required only if one of those documents is
  # itself a Workflow - confirm against the referenced files.
  - class: SubworkflowFeatureRequirement
  # Several step inputs are computed with `valueFrom`.
  - class: StepInputExpressionRequirement
  # The `valueFrom` expressions use JavaScript (string concatenation,
  # array literals), not plain parameter references.
  - class: InlineJavascriptRequirement
  # The `download` step is scattered over the requested years.
  - class: ScatterFeatureRequirement
30
doc: |
  This workflow downloads AQS data from the government
  servers, introspects it to infer the database schema
  and ingests the data into the database

  Example run:
  ```shell
  cwl-runner aqs.cwl sample_aqs_annual.yml
  ```

  See [sample_aqs_annual.yml](sample_aqs.md)

  Or, passing every input on the command line (the `--years` option
  is repeated once per year to download):

  ```shell
  cwl-runner /opt/airflow/project/epa/src/cwl/aqs.cwl \
      --database /opt/airflow/project/database.ini \
      --connection_name nsaph2 \
      --aggregation annual \
      --parameter_code PM25 \
      --table pm25_annual \
      --years 2020 --years 2021 \
      --proxy $HTTP_PROXY
  ```
48
49
# Workflow-level inputs. Everything except `proxy` is required.
inputs:
  proxy:
    type: string?
    default: ""
    doc: HTTP/HTTPS Proxy if required
  database:
    type: File
    doc: Path to database connection file, usually database.ini
  connection_name:
    type: string
    doc: The name of the section in the database.ini file
  aggregation:
    type: string
    doc: |
      Aggregation level of the AQS data to download, e.g. `annual`.
      The value is passed unchanged to the download step.
  parameter_code:
    type: string
    doc: |
      Parameter code. Either a numeric code (e.g. 88101, 44201)
      or symbolic name (e.g. PM25, NO2).
      See more: [AQS Code List](https://www.epa.gov/aqs/aqs-code-list)
  table:
    type: string
    doc: Name of the table to be created in the database
  years:
    # One download is performed per element (see the `download` step).
    type: string[]
    doc: Years to download
75
# Execution order is derived from the data links between steps. The
# `depends_on` inputs are wired to an upstream log so that a step cannot
# start before the step producing that log has finished.
steps:
  # Database preparation; `ingest` waits for the log produced here.
  initdb:
    run: initcoredb.cwl
    doc: Ensure that database utilities are at their latest version
    in:
      database: database
      connection_name: connection_name
    out: [log, err]

  # Scattered step: download_aqs.cwl is invoked once per element of
  # `years`, so `download/data` is an array with one entry per year.
  download:
    run: download_aqs.cwl
    scatter: year
    in:
      year: years
      aggregation: aggregation
      parameter_code: parameter_code
      proxy: proxy
    out:
      - data

  # Not scattered: receives the whole array of downloaded items at once.
  expand:
    run: expand_aqs.cwl
    in:
      input: download/data
      parameter_code: parameter_code
    out:
      - log
      - data

  # Runs after `expand` and produces the `model` consumed by the
  # `ingest`, `index` and `vacuum` steps as their `registry`.
  introspect:
    run: introspect.cwl
    in:
      input: expand/data
      table: table
      output:
        # Constant value; presumably the name of the generated model
        # file - confirm against introspect.cwl.
        valueFrom: 'epa.yaml'
      depends_on: expand/log
    out:
      - log
      - model
      - errors

  # Needs the initialised database (initdb), the model (introspect)
  # and the expanded data (expand).
  ingest:
    run: ingest.cwl
    doc: Uploads data into the database
    in:
      database: database
      connection_name: connection_name
      registry: introspect/model
      domain:
        valueFrom: 'epa'
      table: table
      input: expand/data
      depends_on: initdb/log
    out:
      - log
      - errors

  # Starts only after `ingest` has finished.
  index:
    run: index.cwl
    in:
      database: database
      connection_name: connection_name
      registry: introspect/model
      domain:
        valueFrom: 'epa'
      table: table
      depends_on: ingest/log
    out:
      - log
      - errors

  # Starts only after `index` has finished.
  vacuum:
    run: vacuum.cwl
    in:
      database: database
      connection_name: connection_name
      registry: introspect/model
      domain:
        valueFrom: 'epa'
      table: table
      depends_on: index/log
    out:
      - log
      - errors

  # NOTE(review): this step waits only for `ingest`, so the engine is
  # free to run it in parallel with `index` and `vacuum` - confirm that
  # this is intended.
  export:
    run: export.cwl
    in:
      database: database
      connection_name: connection_name
      format:
        valueFrom: 'parquet'
      # Helper input: exposes the workflow's `table` value to the
      # expressions below as `inputs.table_base_name`.
      table_base_name: table
      # Table name qualified with the `epa.` prefix.
      table:
        valueFrom: "$('epa.' + inputs.table_base_name)"
      partition:
        valueFrom: '$(["year"])'
      # Export destination under `export/`, named after the table.
      output:
        valueFrom: "$('export/' + inputs.table_base_name)"
      depends_on: ingest/log
    out: [data, log, errors]
171
172
# Workflow results: the log of each step that produces one, the error
# streams of the database steps, and the data artifacts.
outputs:
  # --- Step logs ---
  initdb_log:
    type: File
    outputSource: initdb/log
  expand_log:
    type: File
    outputSource: expand/log
  introspect_log:
    type: File
    outputSource: introspect/log
  ingest_log:
    type: File
    outputSource: ingest/log
  index_log:
    type: File
    outputSource: index/log
  vacuum_log:
    type: File
    outputSource: vacuum/log

  # --- Data artifacts ---
  data:
    type: File
    outputSource: expand/data
  model:
    type: File
    outputSource: introspect/model

  # --- Step error streams ---
  # `initdb` declares an `err` output; it is surfaced here so that its
  # error stream is collected like those of the other database steps.
  # NOTE(review): type assumed to be File, like `initdb/log` - confirm
  # against initcoredb.cwl.
  initdb_err:
    type: File
    outputSource: initdb/err
  introspect_err:
    type: File
    outputSource: introspect/errors
  ingest_err:
    type: File
    outputSource: ingest/errors
  index_err:
    type: File
    outputSource: index/errors
  vacuum_err:
    type: File
    outputSource: vacuum/errors

  # --- Export step ---
  # The exported result is declared as either a single File or a Directory.
  export_data:
    type: [File, Directory]
    outputSource: export/data
  export_log:
    type: File
    outputSource: export/log
  export_err:
    type: File
    outputSource: export/errors