airnow.cwl

  1#!/usr/bin/env cwl-runner
  2### Full EPA AirNow Processing Pipeline (including downloading shapefiles)
  3#  Copyright (c) 2021. Harvard University
  4#
  5#  Developed by Research Software Engineering,
  6#  Faculty of Arts and Sciences, Research Computing (FAS RC)
  7#  Author: Michael A Bouzinier
  8#
  9#  Licensed under the Apache License, Version 2.0 (the "License");
 10#  you may not use this file except in compliance with the License.
 11#  You may obtain a copy of the License at
 12#
 13#         http://www.apache.org/licenses/LICENSE-2.0
 14#
 15#  Unless required by applicable law or agreed to in writing, software
 16#  distributed under the License is distributed on an "AS IS" BASIS,
 17#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 18#  See the License for the specific language governing permissions and
 19#  limitations under the License.
 20#
 21
 22cwlVersion: v1.2
 23class: Workflow
 24
 25requirements:
 26  SubworkflowFeatureRequirement: {}
 27  StepInputExpressionRequirement: {}
 28  InlineJavascriptRequirement: {}
 29
 30doc: |
  31  This workflow downloads US Census shape files and AirNow data from
  32  government servers, introspects the data to infer the database schema,
  33  ingests the data into the database, then indexes and vacuums the table.
 34
 35  Example run:
 36  ```shell
 37  cwl-runner airnow.cwl sample_airnow.yml
 38  ```
 39
 40  See [sample_airnow.yml](sample_airnow.md)
 41
 42  Or
 43
 44  ```shell
  45  cwl-runner --parallel /opt/airflow/project/epa/src/cwl/airnow.cwl --database /opt/airflow/project/database.ini --connection_name nsaph2 --proxy $HTTP_PROXY --api-key XXXXXXXX-YYYY-ZZZZ-XXXX-YYYYYYYYY --from 2022-01-01 --to 2022-08-31 --parameter_code pm25 --table airnow_pm25_2022 --year 2022
 46  ```
 47
 48inputs:
 49  proxy:
 50    type: string?
 51    default: ""
 52    doc: HTTP/HTTPS Proxy if required
 53  api-key:
 54    type: string
 55    doc: API key for AirNow
 56  database:
 57    type: File
 58    doc: Path to database connection file, usually database.ini
 59  connection_name:
 60    type: string
 61    doc: The name of the section in the database.ini file
 62  from:
 63    type: string
 64    doc: Start date for downolading, in YYYY-MM-DD format
 65  to:
 66    type: string
 67    doc: End date for downolading, in YYYY-MM-DD format
 68  parameter_code:
 69    type: string
 70    doc: |
 71      Parameter code. Either a numeric code (e.g. 88101, 44201)
 72      or symbolic name (e.g. PM25, NO2).
 73      See more: [AQS Code List](https://www.epa.gov/aqs/aqs-code-list)
 74  table:
 75    doc: Name of the table to be created in the database
 76    type: string
 77  year:
 78    type: int
 79
 80steps:
 81  get_shapes:
 82    run: get_shapes.cwl
 83    doc: |
 84      This step downloads Shape files from a given collection (TIGER/Line or GENZ) 
 85      and a geography (ZCTA or Counties) from the US Census website,
 86      for a given year or for the closest one.
 87
 88    in:
 89      year:
 90        valueFrom: $(String(inputs.yy))
 91      yy: year
 92      geo:
 93        valueFrom: "all"
 94      proxy: proxy
 95    out: [shape_files]
 96
 97  download:
 98    run: download_airnow.cwl
 99    in:
100      api-key: api-key
101      shapes: get_shapes/shape_files
102      from: from
103      to: to
104      table: table
105      parameter_code: parameter_code
106      proxy: proxy
107    out: [log, data]
108
109  introspect:
110    run: introspect.cwl
111    in:
112      depends_on: download/log
113      input: download/data
114      table: table
115      output:
116        valueFrom: epa.yaml
117    out: [log, model]
118
119  ingest:
120    run: ingest.cwl
121    doc: Uploads data into the database
122    in:
123      registry: introspect/model
124      domain:
125        valueFrom: "epa"
126      table: table
127      input: download/data
128      database: database
129      connection_name: connection_name
130    out: [log]
131
132  index:
133    run: index.cwl
134    in:
135      depends_on: ingest/log
136      registry: introspect/model
137      domain:
138        valueFrom: "epa"
139      table: table
140      database: database
141      connection_name: connection_name
142    out: [log]
143
144  vacuum:
145    run: vacuum.cwl
146    in:
147      depends_on: index/log
148      registry: introspect/model
149      domain:
150        valueFrom: "epa"
151      table: table
152      database: database
153      connection_name: connection_name
154    out: [log]
155
156
157outputs:
158  shapes_data:
159    type: File[]
160    outputSource: get_shapes/shape_files
161  download_log:
162    type: File
163    outputSource: download/log
164  ingest_log:
165    type: File
166    outputSource: ingest/log
167  index_log:
168    type: File
169    outputSource: index/log
170  vacuum_log:
171    type: File
172    outputSource: vacuum/log
173  download_data:
174    type: File
175    outputSource: download/data
176  model:
177    type: File
178    outputSource: introspect/model