Source code for dorieh.platform.data_model.model

#  Copyright (c) 2021. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Michael A Bouzinier
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import json
import yaml
import logging
import os
import re
import csv
import datetime
import sys
import traceback
from typing import Dict, Optional

from dorieh.platform.data_model.utils import regex
from dorieh.utils.io_utils import CSVFileWrapper, basename, fopen, SpecialValues
from dorieh.platform.pg_keywords import *


PG_MAXINT = 2147483647

METADATA_TYPE_KEY = "type"
METADATA_TYPE_MODEL = "model_specification"
INDEX_DDL_PATTERN = "CREATE INDEX {option} {name} ON {table} USING {method} ({column}) {include};"
UNIQUE_INDEX_DDL_PATTERN = "CREATE UNIQUE INDEX {option} {name} ON {table} USING {method} ({column});"
INDEX_NAME_PATTERN = "{table}_{column}_idx"
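
# Illustrative example (not part of the original module): for a hypothetical
# table "pollution" indexed on column "zip" with the btree method and no
# CONCURRENTLY option, the patterns above render to
#   index name: pollution_zip_idx
#   DDL:        CREATE INDEX  pollution_zip_idx ON pollution USING btree (zip) ;
# (the doubled spaces come from the empty {option} and {include} placeholders).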

BTREE = "btree"
#HASH = "hash"
HASH = BTREE  # problem building huge hash indices

index_columns = {
    "date":BTREE,
    "fips":HASH,
    "monitor":HASH,
    "name":BTREE,
    "state":HASH,
    "st_abbrev":HASH,
    "year":BTREE,
    "zip":BTREE,
    "*.code":BTREE,
    "*code":BTREE,
    "fips*": BTREE,
    "*.date":BTREE,
    "date_local":BTREE,
    "*.type":HASH,
    "*.name":BTREE
}


def index_method(c: str) -> Optional[str]:
    c = c.lower()
    for i in index_columns:
        if "*" in i:
            if regex(i).fullmatch(c):
                return index_columns[i]
        else:
            if i == c:
                return index_columns[i]
    return None

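# Illustrative example (not part of the original module), assuming regex()
# compiles the glob-style patterns in index_columns into regular expressions:
# a column named "observation.code" has no exact entry in the dictionary but
# matches the "*.code" pattern, so index_method("observation.code") returns
# BTREE, while a column matching no pattern, e.g. "comment", returns None.
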
# Regular expressions used to infer column types from sample values
integer = re.compile(r"-?\d+")
float_number = re.compile(r"(-?\d*)\.(\d+)([e|E][-|+]?\d+)?")
exponent = re.compile(r"(-?\d+)([e|E][-|+]?\d+)")
date = re.compile(r"([12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01]))")

SET_COLUMN = "UPDATE {table} SET {column} = {expression}"

def unquote(s: str) -> str:
    return s.strip().strip('"')

def load_range(n, f) -> int:
    for i in range(0, n):
        try:
            f()
        except StopIteration:
            return i
    return n

def csv_reader(data, unquote=True):
    if unquote:
        q = csv.QUOTE_ALL
    else:
        q = csv.QUOTE_NONE
    return csv.reader(data, quotechar='"', delimiter=',',
                      quoting=q, skipinitialspace=True)

class CustomColumn:
    def __init__(self, name, type, extraction_method):
        self.name = name
        self.type = type
        # An integer extraction method is interpreted as a 1-based index
        # into the '_'-separated parts of the input file name
        self.extraction_method = extraction_method

    def extract_value(self, input_source):
        if isinstance(self.extraction_method, int):
            n = self.extraction_method - 1
            return basename(input_source).split('_')[n]

class Table:
    def __init__(self, metadata_file: str = None,
                 get_entry=None,
                 concurrent_indices: bool = False,
                 data_file: str = None,
                 column_name_replacement: Dict = None):
        metadata = None
        if metadata_file:
            fn = metadata_file.lower()
            with open(metadata_file) as fp:
                if fn.endswith("json"):
                    metadata = json.load(fp)
                elif fn.endswith(".yaml") or fn.endswith(".yml"):
                    metadata = yaml.safe_load(fp)
                else:
                    raise Exception("Unsupported file type for metadata")
            if metadata.get(METADATA_TYPE_KEY) == METADATA_TYPE_MODEL:
                # A previously saved model specification (see save()):
                # restore all stored attributes verbatim
                self.open_entry_function = get_entry
                with open(metadata_file) as fp:
                    j = json.load(fp)
                for a in j:
                    setattr(self, a, j[a])
                if data_file:
                    self.file_path = data_file
                return
        self.metadata = metadata
        if concurrent_indices:
            self.index_option = "CONCURRENTLY"
        else:
            self.index_option = ""
        # Derive the table name from the data file name
        self.file_path = os.path.abspath(data_file)
        file_name = os.path.basename(self.file_path)
        tname, ext = os.path.splitext(file_name)
        while ext not in [".csv", ""]:
            tname, ext = os.path.splitext(tname)
        tname = tname.replace('-', '_')
        self.table = tname
        self.has_commas = False
        self.quoted_values = False
        self.sql_columns = []
        self.csv_columns = []
        self.types = []
        self.open_entry_function = get_entry
        self.force_manual = False
        self.custom_columns = []
        self.create_table_ddl = None
        self.index_ddl = []
        self.column_map = column_name_replacement \
            if column_name_replacement else dict()

    def save(self, to_dir):
        f = os.path.join(to_dir, self.table + ".json")
        j = dict()
        j[METADATA_TYPE_KEY] = METADATA_TYPE_MODEL
        for att in dir(self):
            if att[0] != '_':
                v = getattr(self, att)
                if callable(v):
                    continue
                j[att] = v
        with open(f, "w") as config:
            json.dump(j, config, indent=2)
        logging.info("Saved table definition to: " + f)

    def fopen(self, source):
        entry = self.open_entry_function(source)
        if isinstance(entry, str):
            return fopen(entry, "rt")
        else:
            return entry

    def get_index_ddl(self, column, method):
        n = INDEX_NAME_PATTERN.format(table=self.table, column=column)
        ddl = INDEX_DDL_PATTERN \
            .format(option=self.index_option,
                    name=n,
                    table=self.table,
                    column=column,
                    method=method,
                    include="")
        return n, ddl

    def add_column(self, name, type, extraction_method):
        self.custom_columns.append(CustomColumn(name, type, extraction_method))
        if extraction_method:
            self.force_manual = True

    def make_column(self, name, type, sql, cursor, index=False,
                    include_in_index=None):
        ddl = "ALTER TABLE {table} ADD {column} {type}"\
            .format(table=self.table, column=name, type=type)
        logging.info(ddl)
        cursor.execute(ddl)
        logging.info(sql)
        cursor.execute(sql)
        if index:
            idx, ddl = self.get_index_ddl(column=name, method=HASH)
            if include_in_index:
                # Insert the INCLUDE clause before the terminating semicolon
                # produced by INDEX_DDL_PATTERN
                ddl = ddl.rstrip(" ;") + " include({});".format(include_in_index)
            logging.info(ddl)
            cursor.execute(ddl)

    def make_fips_column(self, cursor):
        column = "fips5"
        s = "format('%2s', state_code)"
        c = "format('%3s', county_code)"
        e = "replace({} || {}, ' ', '0')".format(s, c)
        sql = SET_COLUMN.format(table=self.table, column=column, expression=e)
        self.make_column(column, PG_STR_TYPE, sql, cursor, True)

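    # Illustrative example (not part of the original module): for a
    # hypothetical table "pollution", make_fips_column() issues an UPDATE
    # of the form
    #     UPDATE pollution SET fips5 =
    #         replace(format('%2s', state_code) || format('%3s', county_code), ' ', '0')
    # so state_code = 6 and county_code = 37 yield the zero-padded FIPS '06037'.
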
    def make_iso_column(self, anchor, cursor, include=None):
        """
        Add US State ISO 3166-2 code

        Source: https://simplemaps.com/data/us-cities

        :param anchor: existing column to use to calculate the state ISO code
        :param cursor: Database cursor
        :param include: Additional data (column) to include in the index
            on the new column
        :return:
        """

        column = "state_iso"
        if anchor.lower() == "state_name":
            e = "(SELECT iso FROM us_states " \
                "WHERE us_states.state_name = {}.{})"\
                .format(self.table, anchor)
        else:
            us_iso_column = {
                "fips": "county_fips",
                "fips5": "county_fips"
            }.get(anchor.lower(), anchor.lower())
            e = "(SELECT iso FROM us_iso WHERE us_iso.{} = {}.{} LIMIT 1)"\
                .format(us_iso_column, self.table, anchor)
        sql = SET_COLUMN.format(table=self.table, column=column, expression=e)
        self.make_column(column, PG_STR_TYPE, sql, cursor, index=True,
                         include_in_index=include)

    def parse_fips12(self, cursor):
        column = "fips5"
        e = "CAST(substring(fips12, 1, 5) AS INTEGER)"
        sql = SET_COLUMN.format(table=self.table, column=column, expression=e)
        # self.make_column(column, PG_STR_TYPE, sql, cursor, True)
        self.make_column(column, "INTEGER", sql, cursor, True)

    def make_int_column(self, cursor, source: str, target: str, index: bool):
        type = "INTEGER"
        e = "CAST(NULLIF(REGEXP_REPLACE({what}, '[^0-9]', '', 'g'), '') AS {to})"\
            .format(what=source, to=type)
        sql = SET_COLUMN.format(table=self.table, column=target, expression=e)
        self.make_column(target, type, sql, cursor, index)

    def analyze(self, entry=None):
        if not entry:
            entry = self.file_path
        logging.info("Using for data analysis: " + str(entry))
        # First pass: unquoted values, used for type inference
        with self.fopen(entry) as data:
            reader = csv_reader(data, True)
            row = next(reader)
            self.csv_columns = [unquote(c) for c in row]
            rows = []
            load_range(10000, lambda: rows.append(next(reader)))
        # Second pass: raw values, used to detect quoting
        with self.fopen(entry) as data:
            reader = csv_reader(data, False)
            next(reader)
            lines = []
            load_range(10000, lambda: lines.append(next(reader)))
        if not rows:
            raise Exception("No data in {}".format(self.file_path))
        for row in rows:
            for cell in row:
                if ',' in cell:
                    self.has_commas = True
                    break
        self.guess_types(rows, lines)
        self.sql_columns = []
        for c in self.csv_columns:
            if not c:
                self.sql_columns.append("Col")
            elif c.lower() in self.column_map:
                self.sql_columns.append(self.column_map[c.lower()])
            else:
                self.sql_columns.append(c.replace('.', '_').lower())
        col_spec = [
            "{} \t{}".format(self.sql_columns[i], self.types[i])
            for i in range(0, len(self.csv_columns))
        ]
        for c in self.custom_columns:
            self.sql_columns.append(c.name)
            col_spec.append("{} \t{}".format(c.name, c.type))
        self.create_table_ddl = \
            "CREATE TABLE {}\n ({})".format(self.table, ",\n\t".join(col_spec))
        for c in self.sql_columns:
            m = index_method(c)
            if m:
                self.index_ddl.append(self.get_index_ddl(c, m))

    def create(self, cursor):
        logging.info(self.create_table_ddl)
        cursor.execute(self.create_table_ddl)

    def build_indices(self, cursor, flag: str = None):
        for ddl in self.index_ddl:
            command = ddl[1]
            name = ddl[0]
            if flag == INDEX_REINDEX:
                # Drop an existing index so it is rebuilt from scratch
                sql = "DROP INDEX IF EXISTS {name}".format(name=name)
                logging.info(str(datetime.datetime.now()) + ": " + sql)
                cursor.execute(sql)
            elif flag == INDEX_INCREMENTAL:
                # Skip indices that already exist
                command = command.replace(name, "IF NOT EXISTS " + name)
            elif flag and flag != "default":
                raise Exception("Invalid indexing flag: " + flag)
            logging.info(str(datetime.datetime.now()) + ": " + command)
            cursor.execute(command)

    def drop(self, cursor):
        sql = "DROP TABLE {} CASCADE".format(self.table)
        logging.info(sql)
        cursor.execute(sql)

    def type_from_metadata(self, c: int) -> Optional[str]:
        if not self.metadata:
            return None
        c_name = self.csv_columns[c]
        metadata = self.metadata
        if "table" in metadata:
            metadata = metadata["table"]
        if self.table in metadata:
            metadata = metadata[self.table]
        if "columns" in metadata:
            metadata = metadata["columns"]
        if c_name in metadata:
            return metadata[c_name]
        if isinstance(metadata, list):
            for element in metadata:
                if c_name in element:
                    return element[c_name]
        return None

    def guess_types(self, rows: list, lines: list):
        m = len(rows)
        n = len(rows[0])
        self.types.clear()
        for c in range(0, n):
            c_type = self.type_from_metadata(c)
            precision = 0
            scale = 0
            max_val = 0
            for l in range(0, m):
                v = rows[l][c].strip()
                v2 = lines[l][c].strip()
                if date.fullmatch(v):
                    t = "DATE"
                elif v2 == '"{}"'.format(v):
                    t = PG_STR_TYPE
                    self.quoted_values = True
                elif SpecialValues.is_untyped(v):
                    t = "0"
                else:
                    f = float_number.fullmatch(v)
                    if f:
                        t = PG_NUMERIC_TYPE
                        s = len(f.group(2))
                        p = len(f.group(1))
                        scale = max(scale, s)
                        precision = max(precision, p)
                    elif exponent.fullmatch(v):
                        t = PG_NUMERIC_TYPE
                    elif integer.fullmatch(v):
                        t = PG_INT_TYPE
                    else:
                        t = PG_STR_TYPE
                if t == "0":
                    continue
                if t in [PG_INT_TYPE]:
                    max_val = max(max_val, abs(int(v)))
                if c_type == "0":
                    c_type = t
                elif c_type == PG_NUMERIC_TYPE and t == PG_INT_TYPE:
                    continue
                elif c_type == PG_STR_TYPE and t in [PG_INT_TYPE, PG_NUMERIC_TYPE]:
                    continue
                elif c_type == PG_INT_TYPE and t == PG_NUMERIC_TYPE:
                    c_type = t
                elif c_type and c_type != t:
                    msg = "Inconsistent type for column {:d} [{:s}]. " \
                        .format(c + 1, self.csv_columns[c])
                    msg += "Up to line {:d}: {:s}, for line={:d}: {:s}. " \
                        .format(l - 1, c_type, l, t)
                    msg += "Value = {}".format(v)
                    raise Exception(msg)
                else:
                    c_type = t
            if c_type == PG_INT_TYPE and max_val * 10 > PG_MAXINT:
                c_type = PG_BIGINT_TYPE
            if c_type == PG_NUMERIC_TYPE:
                precision += scale
                c_type = c_type + "({:d},{:d})".format(precision + 2, scale)
            if c_type == "0":
                c_type = PG_NUMERIC_TYPE
            if not c_type:
                c_type = PG_STR_TYPE
            self.types.append(c_type)
        return

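    # Worked example (illustrative only): for a column whose sampled values are
    # "3.14" and "123.4", float_number yields scale = max(2, 1) = 2 and
    # precision = max(1, 3) = 3; after precision += scale the declared type
    # becomes NUMERIC(7,2), since the pattern adds 2 to the precision.
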
    def add_data(self, cursor, entry):
        if self.has_commas:
            logging.info("The CSV file contains commas, copying manually.")
            self.copy_data(cursor, entry)
        elif self.quoted_values:
            logging.info("The CSV file uses quoted values.")
            self.copy_data(cursor, entry)
        elif self.force_manual:
            logging.info("Forcing manual copy of the data.")
            self.copy_data(cursor, entry)
        else:
            logging.info("Copying data using system function.")
            with self.fopen(entry) as data:
                with CSVFileWrapper(data) as csv_data:
                    csv_data.readline()
                    cursor.copy_from(csv_data, self.table, sep=',',
                                     null=SpecialValues.NA,
                                     columns=self.sql_columns)

    def copy_data(self, cursor, input_source):
        N = 10000
        insert = "INSERT INTO {table} ({columns}) VALUES "\
            .format(table=self.table, columns=','.join(self.sql_columns))
        #values = "({})".format('')
        with self.fopen(input_source) as data:
            reader = csv_reader(data, True)
            next(reader)
            lines = 0
            chars = 0
            sql = insert
            t0 = datetime.datetime.now()
            t1 = t0
            while True:
                try:
                    row = next(reader)
                except StopIteration:
                    break
                except:
                    logging.info(row)
                    traceback.print_exc(file=sys.stdout)
                lines += 1
                chars += sum([len(cell) for cell in row])
                for i in range(0, len(self.types)):
                    if SpecialValues.is_missing(row[i]):
                        row[i] = "NULL"
                    elif self.types[i] in [PG_STR_TYPE, PG_DATE_TYPE,
                                           PG_TS_TYPE, PG_TIME_TYPE]:
                        row[i] = "'{}'".format(row[i].replace("'", "''"))
                    elif self.types[i] in [PG_INT_TYPE, PG_BIGINT_TYPE,
                                           PG_NUMERIC_TYPE] and not row[i]:
                        row[i] = "NULL"
                for c in self.custom_columns:
                    row.append(c.extract_value(input_source))
                values = "({})".format(','.join(row))
                sql += values
                if (lines % N) == 0:
                    cursor.execute(sql)
                    t1 = self.log_progress(t0, t1, chars, lines, N)
                    sql = insert
                else:
                    sql += ","
            if (lines % N) != 0:
                if sql[-1] == ',':
                    sql = sql[:-1]
                cursor.execute(sql)
                self.log_progress(t0, t1, chars, lines, N)
        return

    @staticmethod
    def log_progress(t0, t1, chars, lines, N):
        if chars > 1000000000:
            c = "{:7.2f}G".format(chars / 1000000000.0)
        elif chars > 1000000:
            c = "{:6.2f}M".format(chars / 1000000.0)
        else:
            c = str(chars)
        t = datetime.datetime.now()
        dt1 = t - t1
        dt0 = t - t0
        r1 = N / dt1.total_seconds()
        r0 = lines / dt0.total_seconds()
        logging.info(
            "{}: Processed {:,}/{} lines/chars [t={}({}), r={:5.1f}({:5.1f}) lines/sec]"
            .format(str(t), lines, c, str(dt1), str(dt0), r1, r0)
        )
        return t
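

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# Assumes a psycopg2-style connection/cursor and a hypothetical CSV file
# "pollution.csv"; the get_entry callable resolves an entry to an open stream
# or a path, so for a plain file it may simply return the path itself.
#
#   import psycopg2
#
#   table = Table(data_file="pollution.csv", get_entry=lambda path: path)
#   table.analyze()                       # infer column names, types, indices
#   with psycopg2.connect("dbname=dorieh") as cnxn:
#       with cnxn.cursor() as cursor:
#           table.create(cursor)          # CREATE TABLE ...
#           table.add_data(cursor, table.file_path)
#           table.build_indices(cursor)   # CREATE INDEX ...
#       cnxn.commit()
#   table.save(".")                       # persist the inferred model as JSON
# ---------------------------------------------------------------------------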