#!/usr/bin/env cwl-runner
### Full EPA AQS Processing Pipeline (aqs.cwl)
#  Copyright (c) 2021. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Michael A Bouzinier
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

cwlVersion: v1.2
class: Workflow

requirements:
  SubworkflowFeatureRequirement: {}
  # Required for the `valueFrom` fields used on step inputs below.
  StepInputExpressionRequirement: {}
  InlineJavascriptRequirement: {}
  # Required by the `download` step, which scatters over `years`.
  ScatterFeatureRequirement: {}

doc: |
  This workflow downloads AQS data from the government
  servers, introspects it to infer the database schema
  and ingests the data into the database

  Example run:
  ```shell
  cwl-runner aqs.cwl sample_aqs_annual.yml
  ```

  See [sample_aqs_annual.yml](sample_aqs.md)

  Or

  ```shell
  cwl-runner /opt/airflow/project/epa/src/cwl/aqs.cwl \
    --database /opt/airflow/project/database.ini \
    --connection_name nsaph2 \
    --aggregation annual \
    --parameter_code PM25 \
    --table pm25_annual \
    --proxy $HTTP_PROXY
  ```

  The `years` input is required as well; supply it in the job file
  or on the command line.

inputs:
  proxy:
    type: string?
    default: ""
    doc: HTTP/HTTPS Proxy if required
  database:
    type: File
    doc: Path to database connection file, usually database.ini
  connection_name:
    type: string
    doc: The name of the section in the database.ini file
  aggregation:
    type: string
    doc: Aggregation level of the AQS data to download (e.g. annual)
  parameter_code:
    type: string
    doc: |
      Parameter code. Either a numeric code (e.g. 88101, 44201)
      or symbolic name (e.g. PM25, NO2).
      See more: [AQS Code List](https://www.epa.gov/aqs/aqs-code-list)
  table:
    type: string
    doc: Name of the table to be created in the database
  years:
    type: string[]
    doc: Years to download

steps:
  initdb:
    run: initcoredb.cwl
    doc: Ensure that database utilities are at their latest version
    in:
      database: database
      connection_name: connection_name
    out: [log, err]

  # Runs download_aqs.cwl once per entry of `years` (scatter), so
  # `download/data` is a list with one element per requested year.
  download:
    run: download_aqs.cwl
    scatter: year
    in:
      year: years
      aggregation: aggregation
      parameter_code: parameter_code
      proxy: proxy
    out: [data]

  # Consumes the whole list of per-year download results.
  # NOTE(review): what "expand" does to the data is defined in
  # expand_aqs.cwl, which is not visible here.
  expand:
    run: expand_aqs.cwl
    in:
      parameter_code: parameter_code
      input: download/data
    out: [log, data]

  # Schema inference step (see the workflow `doc`). Sourcing
  # `depends_on` from expand/log makes this step wait for `expand`.
  # NOTE(review): `output` presumably names the generated model file;
  # confirm against introspect.cwl.
  introspect:
    run: introspect.cwl
    in:
      depends_on: expand/log
      input: expand/data
      table: table
      output:
        valueFrom: "epa.yaml"
    out: [log, model, errors]

  ingest:
    run: ingest.cwl
    doc: Uploads data into the database
    in:
      registry: introspect/model
      domain:
        valueFrom: "epa"
      table: table
      input: expand/data
      database: database
      connection_name: connection_name
    out: [log, errors]

  # Sourcing `depends_on` from ingest/log makes this step wait for
  # `ingest`.
  # NOTE(review): presumably builds the table indices; confirm
  # against index.cwl.
  index:
    run: index.cwl
    in:
      depends_on: ingest/log
      registry: introspect/model
      domain:
        valueFrom: "epa"
      table: table
      database: database
      connection_name: connection_name
    out: [log, errors]

  # Sourcing `depends_on` from index/log makes this step wait for
  # `index`, so it runs last in the chain.
  # NOTE(review): presumably vacuums the table; confirm against
  # vacuum.cwl.
  vacuum:
    run: vacuum.cwl
    in:
      depends_on: index/log
      registry: introspect/model
      domain:
        valueFrom: "epa"
      table: table
      database: database
      connection_name: connection_name
    out: [log, errors]

# NOTE(review): the `initdb` step also requests an `err` output, but
# it is not exported here, unlike the error outputs of the
# introspect/ingest/index/vacuum steps. Export it once its type has
# been confirmed in initcoredb.cwl.
outputs:
  initdb_log:
    type: File
    outputSource: initdb/log
  expand_log:
    type: File
    outputSource: expand/log
  introspect_log:
    type: File
    outputSource: introspect/log
  ingest_log:
    type: File
    outputSource: ingest/log
  index_log:
    type: File
    outputSource: index/log
  vacuum_log:
    type: File
    outputSource: vacuum/log
  data:
    type: File
    outputSource: expand/data
  model:
    type: File
    outputSource: introspect/model
  introspect_err:
    type: File
    outputSource: introspect/errors
  ingest_err:
    type: File
    outputSource: ingest/errors
  index_err:
    type: File
    outputSource: index/errors
  vacuum_err:
    type: File
    outputSource: vacuum/errors
187    type: File
188    outputSource: vacuum/errors