Source code for dorieh.utils.io_utils

#  Copyright (c) 2021-2024.  Harvard University
#
#   Developed by Research Software Engineering,
#   Harvard University Research Computing and Data (RCD) Services.
#
#   Author: Michael A Bouzinier
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#          http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

import codecs
import csv
import glob
import gzip
import io
import json
import logging
import os
import tarfile
import tempfile
import zipfile
from datetime import datetime, timezone
from typing import IO, List, Tuple, Callable
from abc import ABC, abstractmethod

import requests
import yaml
from dateutil.parser import parse
from requests.models import Response


logger = logging.getLogger(__name__)
HEADERS = {'User-Agent': 'NSAPH Data warehouse app; https://github.com/NSAPH-Data-Platform'}


def sizeof_fmt(num, suffix="B") -> str:
    if num is None:
        return "Unknown"
    units = ["", "K", "M", "G", "T", "P"]
    for unit in units:
        if unit == units[-1]:
            fmt = f"{num:.1f}"
        else:
            fmt = f"{num:3.1f}"
        if abs(num) < 1024.0 or unit == units[-1]:
            return fmt + f"{unit}{suffix}"
        num /= 1024.0
    return "Error calculating size"

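# A minimal usage sketch (not part of the original module):
#
#   sizeof_fmt(1536)     # '1.5KB'
#   sizeof_fmt(10 ** 9)  # '953.7MB'
#   sizeof_fmt(None)     # 'Unknown'
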
class DownloadTask:
    def __init__(self, destination: str, urls: List = None, metadata=None):
        self.destination = destination
        if urls:
            self.urls = urls
        else:
            self.urls = []
        self.metadata = metadata

    def add_url(self, url: str):
        self.urls.append(url)

    def reset(self):
        os.remove(self.destination)

    def __str__(self):
        dest = os.path.abspath(self.destination)
        if len(self.urls) == 1:
            return "{} ==> {}".format(self.urls[0], dest)
        return "[{:d}]==> {}".format(len(self.urls), dest)

    def is_up_to_date(self, is_transformed: bool = True):
        if len(self.urls) == 1 and not is_transformed:
            return is_downloaded(self.urls[0], self.destination)
        for url in self.urls:
            if not is_downloaded(url, self.destination, 1000):
                return False
        return True

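# A hypothetical usage sketch (the path and URL are illustrative only):
#
#   task = DownloadTask("data/pm25.csv",
#                       urls=["https://example.org/pm25.csv"])
#   if not task.is_up_to_date(is_transformed=False):
#       with open(task.destination, "wb") as out:
#           download(task.urls[0], out)
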
def as_stream(url: str, extension: str = ".csv", params=None, mode=None):
    """
    Returns the content of the URL as a stream. If the content is in zip
    format (excluding gzip), a temporary file is created.

    :param mode: optional parameter to specify the desired mode: text or
        binary. Possible values: 't' or 'b'
    :param params: Optional. A dictionary, list of tuples or bytes to send
        as a query string.
    :param url: URL
    :param extension: optional, when the content is zip-encoded, the
        extension of the zip entry to return
    :return: Content of the URL or a zip entry
    """
    response = requests.get(url, stream=True, params=params)
    check_http_response(response)
    raw = response.raw
    if url.lower().endswith(".zip"):
        tfile = tempfile.TemporaryFile()
        download(url, tfile)
        tfile.seek(0)
        zfile = zipfile.ZipFile(tfile)
        entries = [e for e in zfile.namelist() if e.endswith(extension)]
        assert len(entries) == 1
        stream = io.TextIOWrapper(zfile.open(entries[0]))
    else:
        if mode == 't':
            stream = io.TextIOWrapper(raw)
        else:
            stream = raw
    return stream

def as_content(url: str, params=None, mode=None):
    """
    Returns a byte or text block with the URL content

    :param url: URL
    :param params: Optional. A dictionary, list of tuples or bytes to send
        as a query string.
    :param mode: optional parameter to specify the desired return format:
        text or binary. Possible values: 't' or 'b', default is binary
    :return: Content of the URL
    """
    response = requests.get(url, params=params)
    check_http_response(response)
    if mode == 't':
        return response.text
    return response.content

def as_csv_reader(url: str, mode=None) -> csv.DictReader:
    """
    A utility method to return the CSV content of the URL as csv.DictReader

    :param url: URL
    :return: an instance of csv.DictReader
    """
    stream = as_stream(url, mode=mode)
    reader = csv.DictReader(stream, quotechar='"', delimiter=',',
                            quoting=csv.QUOTE_NONNUMERIC,
                            skipinitialspace=True)
    return reader

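# A minimal sketch of reading a remote CSV (the URL is hypothetical):
#
#   reader = as_csv_reader("https://example.org/data.csv", mode='t')
#   for row in reader:
#       print(row)
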
def file_as_stream(filename: str, extension: str = ".csv", mode=None):
    """
    Returns the content of the file as a stream. If the content is in zip
    format (excluding gzip), a temporary file is created.

    :param mode: optional parameter to specify the desired mode: text or
        binary. Possible values: 't' or 'b'
    :param filename: path to the file
    :param extension: optional, when the content is zip-encoded, the
        extension of the zip entry to return
    :return: Content of the file or a zip entry
    """
    if filename.lower().endswith(".zip"):
        zfile = zipfile.ZipFile(filename)
        entries = [e for e in zfile.namelist() if e.endswith(extension)]
        if len(entries) == 1:
            entry = entries[0]
        else:
            base_name, _ = os.path.splitext(os.path.basename(filename))
            entry = base_name + extension
            if entry not in entries:
                raise AssertionError(
                    f"File {filename} should either contain a single entry "
                    f"or contain {entry}"
                )
        stream = io.TextIOWrapper(zfile.open(entry))
    else:
        try:
            raw = open(filename, "rb")
        except IOError as e:
            logger.exception("Cannot read %s: %s", filename, e)
            raise
        if mode == 't':
            stream = io.TextIOWrapper(raw)
        else:
            stream = raw
    return stream

def file_as_csv_reader(filename: str):
    """
    A utility method to return the CSV content of the file as csv.DictReader

    :param filename: path to the file
    :return: an instance of csv.DictReader
    """
    stream = file_as_stream(filename)
    reader = csv.DictReader(
        stream,
        quotechar='"',
        delimiter=',',
        quoting=csv.QUOTE_NONNUMERIC,
        skipinitialspace=True,
    )
    return reader

def fopen(path: str, mode: str):
    """
    A wrapper to open various types of files

    :param path: Path to the file
    :param mode: Opening mode
    :return: file-like object
    """
    if isinstance(path, io.BufferedReader):
        return codecs.getreader("utf-8")(path)
    if path.lower().endswith(".gz"):
        # return io.TextIOWrapper(gzip.open(path, mode))
        return gzip.open(path, mode)
    if 'b' in mode:
        return open(path, mode)
    return open(path, mode, encoding="utf-8")

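# fopen dispatches on the extension, so gzipped and plain files can be read
# uniformly; a short sketch with hypothetical file names:
#
#   with fopen("data.csv.gz", "rt") as f:  # transparently decompressed
#       header = f.readline()
#   with fopen("data.csv", "rt") as f:     # plain UTF-8 text
#       header = f.readline()
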
def check_http_response(r: Response):
    """
    An internal method that raises an exception if the HTTP response
    is not OK

    :param r: Response
    :return: nothing, raises an exception if the response is not OK
    """
    if not r.ok:
        reason = r.reason
        if not reason:
            reason = r.content
        msg = "{}: HTTP Response {:d}; Reason: {}".format(
            r.url, r.status_code, reason
        )
        raise Exception(msg)

def download(url: str, to: IO):
    """A utility method to download large binary data to a file-like object"""
    response = requests.get(url, stream=True, headers=HEADERS)
    check_http_response(response)
    for chunk in response.iter_content(chunk_size=1048576):
        to.write(chunk)
        print('#', end='')
    print('.', end=' ')
    return

def is_downloaded(url: str, target: str, check_size: int = 0) -> bool:
    """
    Checks if the same data has already been downloaded

    :param check_size: Use the default value (0) if the target size should
        be equal to the source size. If several URLs are combined into one
        download, specify a positive integer to check that the destination
        file size is greater than the specified value. A negative value
        effectively disables the size check.
    :param url: URL with data
    :param target: Destination of the download
    :return: True if the destination file exists and is newer than
        the URL content
    """
    if os.path.isfile(target):
        response = requests.head(url, allow_redirects=True)
        check_http_response(response)
        headers = response.headers
        remote_size = int(headers.get('content-length', 0))
        remote_date = parse(headers.get('Last-Modified', 0))
        stat = os.stat(target)
        local_size = stat.st_size
        local_date = datetime.fromtimestamp(stat.st_mtime, timezone.utc)
        date_check = local_date >= remote_date
        if check_size == 0:
            size_check = local_size == remote_size
        else:
            size_check = local_size > check_size
        return date_check and size_check
    return False

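# A hedged example (URL and path are illustrative): re-download only when
# the local copy is older or smaller than the remote file.
#
#   url = "https://example.org/data.csv"
#   if not is_downloaded(url, "data.csv"):
#       with open("data.csv", "wb") as out:
#           download(url, out)
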
def write_csv(reader: csv.DictReader, writer: csv.DictWriter,
              transformer=None, filter=None, write_header: bool = True):
    """
    Rewrites the CSV content, optionally transforming and filtering rows

    :param transformer: An optional callable that transforms a row in place
    :param reader: Input data as an instance of csv.DictReader
    :param writer: Output destination, provided as csv.DictWriter
    :param filter: Optionally, a callable returning True for rows that
        should be written to the output and False for those that should
        be omitted
    :param write_header: whether to write the header row first
    :return: Nothing
    """
    counter = 0
    if write_header:
        writer.writeheader()
    for row in reader:
        if transformer:
            transformer(row)
        if (not filter) or filter(row):
            writer.writerow(row)
        counter += 1
        if (counter % 10000) == 0:
            print("*", end="")
    print()

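# A sketch of a transform-and-filter pipeline (file and column names are
# hypothetical):
#
#   with fopen("in.csv", "rt") as i, fopen("out.csv", "wt") as o:
#       reader = csv.DictReader(i)
#       writer = csv.DictWriter(o, fieldnames=reader.fieldnames)
#       write_csv(reader, writer,
#                 transformer=lambda row: row.update(state=row["state"].upper()),
#                 filter=lambda row: row["year"] >= "2000")
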
def count_lines(f):
    with fopen(f, "r") as x:
        return sum(1 for line in x)

class Collector(ABC):
    def __init__(self):
        pass

    @abstractmethod
    def writerow(self, data: List):
        pass

    def flush(self):
        pass

class CSVWriter(Collector):
    def __init__(self, out_stream):
        super().__init__()
        self.out = out_stream
        self.writer = csv.writer(out_stream, delimiter=',',
                                 quoting=csv.QUOTE_NONE)

    def writerow(self, row: List):
        self.writer.writerow(row)

    def flush(self):
        self.out.flush()

class ListCollector(Collector):
    def __init__(self):
        super().__init__()
        self.collection = []

    def writerow(self, data: List):
        self.collection.append(data)

    def get_result(self):
        return self.collection

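# Both Collector implementations share the writerow() contract, so the same
# producer code can emit rows to a CSV stream or into memory; an
# illustrative sketch:
#
#   import sys
#   for collector in (CSVWriter(sys.stdout), ListCollector()):
#       collector.writerow(["fips", "year", "pm25"])
#       collector.flush()
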
def as_dict(json_or_yaml_file: str) -> dict:
    if isinstance(json_or_yaml_file, str) \
            and os.path.isfile(json_or_yaml_file):
        with open(json_or_yaml_file) as f:
            ff = json_or_yaml_file.lower()
            if ff.endswith(".json"):
                content = json.load(f)
            elif ff.endswith(".yml") or ff.endswith(".yaml"):
                content = yaml.safe_load(f)
            else:
                raise Exception(
                    "Unsupported format for user request: {}"
                    .format(json_or_yaml_file)
                    + ". Supported formats are: JSON, YAML"
                )
    elif isinstance(json_or_yaml_file, dict):
        content = json_or_yaml_file
    else:
        t = str(type(json_or_yaml_file))
        raise Exception("Unsupported type of the specification: {}".format(t))
    return content

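# as_dict accepts either a path to a JSON/YAML file or an already parsed
# dict; a minimal sketch (the file name is hypothetical):
#
#   config = as_dict("pipeline.yaml")
#   config = as_dict({"source": "epa"})  # passed through unchanged
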
def basename(path):
    """
    Returns the name of a file or an archive entry without its extension

    :param path: a path to a file or an archive entry
    :return: base name without the full path or extension
    """
    if isinstance(path, tarfile.TarInfo):
        full_name = path.name
    else:
        full_name = str(path)
    name, _ = os.path.splitext(os.path.basename(full_name))
    return name

def is_readme(name: str) -> bool:
    """
    Checks if a file is a documentation file.

    This method is used to extract some metadata from documentation
    provided as Markdown files.

    :param name: the file name to check
    :return: True if the file looks like a README or documentation file
    """
    name = name.lower()
    if name.endswith(".md"):
        return True
    if name.startswith("readme"):
        return True
    if name.startswith("read.me"):
        return True
    if "readme" in name:
        return True
    return False

def get_entries(path: str) -> Tuple[List, Callable]:
    """
    Returns a list of entries in an archive or files in a directory

    :param path: path to a directory or an archive
    :return: Tuple with the list of entry names and a method to open
        these entries for reading
    """
    if path.endswith(".tar") or path.endswith(".tgz") \
            or path.endswith(".tar.gz"):
        tfile = tarfile.open(path)
        entries = [
            e for e in tfile.getmembers()
            if e.isfile() and not is_readme(e.name)
        ]
        f = lambda e: codecs.getreader("utf-8")(tfile.extractfile(e))
    elif path.endswith(".zip"):
        zfile = zipfile.ZipFile(path)
        entries = [e for e in zfile.namelist() if not is_readme(e)]
        f = lambda e: io.TextIOWrapper(zfile.open(e))
    elif os.path.isdir(path):
        entries = [
            filename for filename in glob.iglob(
                os.path.join(path, '**', '**'), recursive=True
            )
            if os.path.isfile(filename) and not is_readme(filename)
        ]
        f = lambda e: fopen(e, "rt")
    elif os.path.isfile(path):
        entries = [path]
        f = lambda e: fopen(e, "rt")
    else:
        entries = [path]
        f = lambda e: e
    return entries, f

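# A sketch of uniform iteration over a directory, a .zip, or a .tar.gz
# archive (the path is hypothetical):
#
#   entries, opener = get_entries("downloads/archive.tar.gz")
#   for entry in entries:
#       stream = opener(entry)
#       print(basename(entry), stream.readline())
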
def get_readme(path: str):
    """
    Looks for a README file in the specified path

    :param path: a path to a folder or an archive
    :return: a file that is possibly a README file
    """
    encoding = "utf-8"
    if path.endswith(".tar") or path.endswith(".tgz") \
            or path.endswith(".tar.gz"):
        tfile = tarfile.open(path, encoding=encoding)
        readmes = [
            tfile.extractfile(e).read().decode(encoding)
            for e in tfile.getmembers()
            if e.isfile() and is_readme(e.name)
        ]
    elif path.endswith(".zip"):
        zfile = zipfile.ZipFile(path)
        readmes = [
            io.TextIOWrapper(zfile.open(e)).read()
            for e in zfile.namelist() if is_readme(e)
        ]
    elif os.path.isdir(path):
        files = os.listdir(path)
        readmes = [f for f in files if is_readme(f)]
    else:
        readmes = None
    if readmes:
        return readmes[0]
    return None

def is_dir(path: str) -> bool:
    """
    Determines if a certain path specification refers to a collection
    of files or a single entry. Examples of collections are folders
    (directories) and archives.

    :param path: path specification
    :return: True if the specification refers to a collection of files
    """
    return (path.endswith(".tar")
            or path.endswith(".tgz")
            or path.endswith(".tar.gz")
            or path.endswith(".zip")
            or os.path.isdir(path)
            )

def is_yaml_or_json(path: str) -> bool:
    path = path.lower()
    for ext in [".yml", ".yaml", ".json"]:
        if path.endswith(ext) or path.endswith(ext + ".gz"):
            return True
    return False

### R Dependencies
try:
    from rpy2.robjects import DataFrame, NA_Logical, NA_Real, NA_Integer, \
        NA_Character, NA_Complex
    from dorieh.utils.pyfst import vector2list, FSTReader

    def fst2csv(path: str, buffer_size=10000):
        if not path.endswith(".fst"):
            raise Exception("Unknown format of file " + path)
        name = path[:-4]
        dest = name + ".csv.gz"
        n = 0
        t0 = datetime.now()
        with FSTReader(path, returns_mapping=True) as reader, \
                fopen(dest, "wt") as output:
            writer = csv.DictWriter(output, reader.columns,
                                    quoting=csv.QUOTE_NONNUMERIC)
            writer.writeheader()
            width = len(reader.columns)
            for row in reader:
                writer.writerow(row)
                n += 1
                if (n % buffer_size) == 0:
                    t2 = datetime.now()
                    elapsed = (t2 - t0).total_seconds()
                    rate = n / elapsed if elapsed > 0 else 0.0
                    logging.info(
                        "Read {}: {:d} x {:d}; {} {:f} rows/sec"
                        .format(path, n, width, str(t2 - t0), rate)
                    )
        logger.info("Complete. Total read {}: {:d} x {:d}"
                    .format(path, width, n))
        return

    def dataframe2csv(df: DataFrame, dest: str, append: bool):
        t0 = datetime.now()
        columns = {
            df.colnames[c]: vector2list(df[c]) for c in range(df.ncol)
        }
        t1 = datetime.now()
        if append:
            mode = "at"
        else:
            mode = "wt"
        with fopen(dest, mode) as output:
            writer = csv.DictWriter(output, columns,
                                    quoting=csv.QUOTE_NONNUMERIC)
            if not append:
                writer.writeheader()
            for r in range(df.nrow):
                row = {
                    column: columns[column][r] for column in columns
                }
                writer.writerow(row)
        t2 = datetime.now()
        print("{} + {} = {}".format(str(t1 - t0), str(t2 - t1), str(t2 - t0)))
        return

    R_NA_values = [
        NA_Logical, NA_Real, NA_Integer, NA_Character, NA_Complex,
        str(NA_Logical), str(NA_Real), str(NA_Integer),
        str(NA_Character), str(NA_Complex)
    ]
except:
    R_NA_values = []

class SpecialValues:
    NA = "NA"
    NaN = "NaN"

    @classmethod
    def is_missing(cls, v) -> bool:
        try:
            if v is None:
                return True
            return v in [
                cls.NA,
                cls.NaN,
            ] + R_NA_values
        except:
            logging.error("Value: " + str(v))
            raise

    @classmethod
    def is_untyped(cls, v) -> bool:
        if not v:
            return True
        return cls.is_missing(v) or v in ['0']

class CSVFileWrapper:
    """
    A wrapper around a CSV reader that:

    * counts characters and lines read
    * logs the progress of reading the file
    * performs on-the-fly replacement of null and special values
    """

    def __init__(self, file_like_object, sep=',',
                 null_replacement=SpecialValues.NA):
        self.file_like_object = file_like_object
        self.sep = sep
        self.null_replacement = null_replacement
        self.empty_string = self.sep + self.sep
        self.null_string = self.sep + self.null_replacement + sep
        self.empty_string_eol = self.sep + '\n'
        self.null_string_eol = self.sep + self.null_replacement + '\n'
        self.l = len(sep)
        self.remainder = ""
        self.line_number = 0
        self.last_printed_line_number = 0
        self.chars = 0

    def __getattr__(self, called_method):
        if called_method == "readline":
            return self._readline
        if called_method == "read":
            return self._read
        return getattr(self.file_like_object, called_method)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.file_like_object.close()

    def _replace_empty(self, s: str):
        while self.empty_string in s:
            s = s.replace(self.empty_string, self.null_string)
        s = s.replace(self.empty_string_eol, self.null_string_eol)
        return s

    def _readline(self):
        line = self.file_like_object.readline()
        self.line_number += 1
        self.chars += len(line)
        return self._replace_empty(line)

    def _read(self, size, *args, **keyargs):
        if len(self.remainder) < size:
            raw_buffer = self.file_like_object.read(size, *args, **keyargs)
            buffer = raw_buffer
            while buffer[-self.l:] == self.sep:
                next_char = self.file_like_object.read(self.l)
                if not next_char:
                    # end of file: stop extending the buffer
                    break
                buffer += next_char
            buffer = self._replace_empty(buffer)
        else:
            raw_buffer = ""
            buffer = raw_buffer
        if self.remainder:
            buffer = self.remainder + buffer
            self.remainder = ""
        if len(buffer) > size:
            self.remainder = buffer[size - len(buffer):]
            result = buffer[0:size]
        else:
            result = buffer
        self.chars += len(result)
        nl = result.count('\n')
        self.line_number += nl
        t = datetime.now()
        if (self.line_number - self.last_printed_line_number) > 1000000:
            if self.chars > 1000000000:
                c = "{:7.2f}G".format(self.chars / 1000000000.0)
            elif self.chars > 1000000:
                c = "{:6.2f}M".format(self.chars / 1000000.0)
            else:
                c = str(self.chars)
            dt = datetime.now() - t
            t = datetime.now()
            logging.info("{}: Processed {:,}/{} lines/chars [{}]"
                         .format(str(t), self.line_number, c, str(dt)))
            self.last_printed_line_number = self.line_number
        return result

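# A hedged sketch of the wrapper in use (the file name is hypothetical).
# Empty values between separators are replaced with "NA" on the fly, so
# downstream parsers see explicit missing-value markers:
#
#   with CSVFileWrapper(fopen("raw.csv", "rt")) as f:
#       reader = csv.reader(f)
#       for record in reader:
#           pass  # process records; f.line_number tracks progress
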