aqs.cwl

  1#!/usr/bin/env cwl-runner
  2### Full EPA AQS Processing Pipeline
  3#  Copyright (c) 2021. Harvard University
  4#
  5#  Developed by Research Software Engineering,
  6#  Faculty of Arts and Sciences, Research Computing (FAS RC)
  7#  Author: Michael A Bouzinier
  8#
  9#  Licensed under the Apache License, Version 2.0 (the "License");
 10#  you may not use this file except in compliance with the License.
 11#  You may obtain a copy of the License at
 12#
 13#         http://www.apache.org/licenses/LICENSE-2.0
 14#
 15#  Unless required by applicable law or agreed to in writing, software
 16#  distributed under the License is distributed on an "AS IS" BASIS,
 17#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 18#  See the License for the specific language governing permissions and
 19#  limitations under the License.
 20#
 21
 22cwlVersion: v1.2
 23class: Workflow
 24
 25requirements:
 26  SubworkflowFeatureRequirement: {}
 27  StepInputExpressionRequirement: {}
 28  InlineJavascriptRequirement: {}
 29  ScatterFeatureRequirement: {}
 30
 31doc: |
 32  This workflow downloads AQS data from the government
 33  servers, introspects it to infer the database schema
 34  and ingests the data into the database
 35
 36  Example run:
 37  ```shell
 38  cwl-runner aqs.cwl sample_aqs_annual.yml
 39  ```
 40
 41  See [sample_aqs_annual.yml](sample_aqs.md)
 42
 43  Or
 44
 45  ```shell
 46  cwl-runner /opt/airflow/project/epa/src/cwl/aqs.cwl --database /opt/airflow/project/database.ini --connection_name nsaph2 --agregation annual --parameter_code PM25 --table pm25_annual --proxy $HTTP_PROXY
 47  ```
 48
 49
 50inputs:
 51  proxy:
 52    type: string?
 53    default: ""
 54    doc: HTTP/HTTPS Proxy if required
 55  database:
 56    type: File
 57    doc: Path to database connection file, usually database.ini
 58  connection_name:
 59    type: string
 60    doc: The name of the section in the database.ini file
 61  aggregation:
 62    type: string
 63  parameter_code:
 64    type: string
 65    doc: |
 66      Parameter code. Either a numeric code (e.g. 88101, 44201)
 67      or symbolic name (e.g. PM25, NO2).
 68      See more: [AQS Code List](https://www.epa.gov/aqs/aqs-code-list)
 69  table:
 70    doc: Name of the table to be created in the database
 71    type: string
 72  years:
 73    type: string[]
 74    doc: Years to download
 75
 76steps:
 77  initdb:
 78    run: initcoredb.cwl
 79    doc: Ensure that database utilities are at their latest version
 80    in:
 81      database: database
 82      connection_name: connection_name
 83    out:
 84      - log
 85      - err
 86
 87  download:
 88    run: download_aqs.cwl
 89    scatter: year
 90    in:
 91      year: years
 92      aggregation: aggregation
 93      parameter_code: parameter_code
 94      proxy: proxy
 95    out: [data]
 96
 97  expand:
 98    run: expand_aqs.cwl
 99    in:
100      parameter_code: parameter_code
101      input: download/data
102    out: [log, data]
103
104  introspect:
105    run: introspect.cwl
106    in:
107      depends_on: expand/log
108      input: expand/data
109      table: table
110      output:
111        valueFrom: epa.yaml
112    out: [log, model, errors]
113
114  ingest:
115    run: ingest.cwl
116    doc: Uploads data into the database
117    in:
118      registry: introspect/model
119      domain:
120        valueFrom: "epa"
121      table: table
122      input: expand/data
123      database: database
124      connection_name: connection_name
125    out: [log, errors]
126
127  index:
128    run: index.cwl
129    in:
130      depends_on: ingest/log
131      registry: introspect/model
132      domain:
133        valueFrom: "epa"
134      table: table
135      database: database
136      connection_name: connection_name
137    out: [log, errors]
138
139  vacuum:
140    run: vacuum.cwl
141    in:
142      depends_on: index/log
143      registry: introspect/model
144      domain:
145        valueFrom: "epa"
146      table: table
147      database: database
148      connection_name: connection_name
149    out: [log, errors]
150
151  export:
152    run: export.cwl
153    in:
154      database: database
155      connection_name: connection_name
156      format:
157        valueFrom: "parquet"
158      table_base_name: table
159      table:
160        valueFrom: $('epa.' + inputs.table_base_name)
161      partition:
162        valueFrom: $(["year"])
163      output:
164        valueFrom: $('export/' + inputs.table_base_name)
165    out:
166      - data
167      - log
168      - errors
169
170
# Workflow outputs: each one re-exports a single step output.
outputs:
  # --- Logs ---
  initdb_log:
    outputSource: initdb/log
    type: File
  expand_log:
    outputSource: expand/log
    type: File
  introspect_log:
    outputSource: introspect/log
    type: File
  ingest_log:
    outputSource: ingest/log
    type: File
  index_log:
    outputSource: index/log
    type: File
  vacuum_log:
    outputSource: vacuum/log
    type: File
  export_log:
    outputSource: export/log
    type: File

  # --- Error streams ---
  introspect_err:
    outputSource: introspect/errors
    type: File
  ingest_err:
    outputSource: ingest/errors
    type: File
  index_err:
    outputSource: index/errors
    type: File
  vacuum_err:
    outputSource: vacuum/errors
    type: File
  export_err:
    outputSource: export/errors
    type: File

  # --- Data products ---
  data:
    outputSource: expand/data
    type: File
  model:
    outputSource: introspect/model
    type: File
  # Declared as either a single file or a directory.
  export_data:
    outputSource: export/data
    type:
      - File
      - Directory
216    type: File
217    outputSource: export/errors