#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 11 13:45:38 2019
Reads files with the CDM table format from a file system to a pandas.Dataframe.
All CDM fields are read as objects. Null values are read with the specified null value
in the table files, or as NaN if the na_values argument is set to the a specific null
value in the file.
Reads the full set of files (default), a subset or a single table, as controlled
by cdm_subset:
- When reading multiple tables, the resulting dataframe is multi-indexed in
the columns, with (table-name, field) as column names. Merging of tables
occurs on the report_id field.
- When reading a single table, the resulting dataframe has simple indexing
in the columns.
Reads the full set of fields (default) or a subset of it, as controlled by
param col_subset:
- When reading multiple tables (default or subset), the col_subset is a
dictionary like: col_subset = {table0:[columns],...tablen:[columns]}
If a table is not specified in col_subset, all its fields are read.
- When reading a single table, the col_subset is a list like:
col_subset = [columns]
- It is assumed that the column names are all conform to the cdm field names
in lib.tables/*.json
The full table set (header, observations-*) is assumed to be in the same directory.
Filenames for tables are assumed to be:
tableName-<tb_id>.<extension>
with:
valid tableName: as declared in properties.cdm_tables
tb_id: any identifier including wildcards if required
extension: defaulting to 'psv'
When specifying a subset of tables, valid names are those in properties.cdm_tables
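
Example (a hedged sketch; the path, identifier and column names are
illustrative placeholders only):

    read_tables('/data/cdm', '2010-07',
                cdm_subset=['header', 'observations-sst'],
                col_subset={'header': ['report_id', 'latitude', 'longitude']})
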
@author: iregon
"""
import glob
import os

import pandas as pd

from cdm import properties
from cdm.common import logging_hdlr
module_path = os.path.dirname(os.path.abspath(__file__))
def read_tables(tb_path, tb_id, cdm_subset=None, delimiter='|',
                extension='psv', col_subset=None, log_level='INFO',
                na_values=None):
"""
Reads CDM table like files from file system to a pandas data frame.
Parameters
----------
tb_path:
path to the file
tb_id:
any identifier including wildcards if required extension, defaulting to 'psv'
cdm_subset: specifies a subset of tables or a single table.
- For multiple subsets of tables: This option will return a pandas.Dataframe that is multi-index at
the columns, with (table-name, field) as column names. Tables are merged via the report_id field.
- For a single table: the function returns a pandas.Dataframe with a simple indexing for the columns.
delimiter:
default is '|'
extension:
default is psv
col_subset: a python dictionary specifying the section or sections of the file to read
- For multiple sections of the tables:
e.g ``col_subset = {table0:[columns],...tablen:[columns]}``
- For a single section:
e.g. ``list type object col_subset = [columns]``
This variable assumes that the column names are all conform to the cdm field names in lib.tables/*.json
log_level: Level of logging messages to save
na_values: specifies the format of NaN values
Returns
-------
pandas.Dataframe: either the entire file or a subset of it.
logger.error: logs specific messages if there is any error.
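
    Examples
    --------
    A hedged sketch; the path and identifier are illustrative placeholders:

    >>> df = read_tables('/data/cdm', '2010-07',
    ...                  cdm_subset=['header', 'observations-at'])
    >>> df[('header', 'report_id')].head()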
"""
logger = logging_hdlr.init_logger(__name__, level=log_level)
    # Note: because of how the printers are written, they modify the original
    # dataframe, also removing rows with an empty observation_value in the
    # observation tables.
if not os.path.isdir(tb_path):
        logger.error('Data path not found: {}'.format(tb_path))
return
    # See if there is anything at all:
files = glob.glob(os.path.join(tb_path, '*' + tb_id + '*.' + extension))
if len(files) == 0:
logger.error('No files found matching pattern {}'.format(tb_id))
return
    # If a subset was requested, check that all tables are valid CDM tables
if cdm_subset:
for tb in cdm_subset:
if tb not in properties.cdm_tables:
logger.error('Requested table {} not defined in CDM'.format(tb))
return
tables = properties.cdm_tables if not cdm_subset else cdm_subset
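    # Build the expected file pattern per table: tableName-<tb_id>.<extension>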
file_patterns = {tb: os.path.join(tb_path, '-'.join([tb, tb_id]) + '.' + extension) for tb in tables}
file_paths = {}
for k, v in file_patterns.items():
logger.info('Getting file path for pattern {}'.format(v))
file_path = glob.glob(v)
if len(file_path) == 1:
file_paths[k] = file_path[0]
elif len(file_path) > 1:
logger.error(
'Pattern {0} resulted in multiple files for table {1}. '
                'Cannot unambiguously retrieve cdm table(s)'.format(tb_id, k))
return
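    # Tables whose pattern matched no file are skipped above; error out only
    # if no table file was found at all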
if len(file_paths) == 0:
logger.error(
            'No cdm table files found for search patterns: {}'.format(','.join(list(file_patterns.values()))))
return
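    # usecols drives column selection in pandas.read_csv: a flat list when a
    # single table is read, a per-table mapping (None meaning all columns) otherwise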
usecols = None if len(tables) == 1 else {table: None for table in tables}
if col_subset:
if len(tables) == 1:
if not isinstance(col_subset, list):
logger.error('Column subset (col_subset) has to be declared as a list')
return
else:
usecols = col_subset
else:
if not isinstance(col_subset, dict):
logger.error(
'Column subset (col_subset) has to be declared as a dictionary '
'with a table:[columns] pair per table to subset')
return
else:
usecols = {table: col_subset.get(table, None) for table in tables}
    logger.info('Reading data files into dataframe: {}'.format(','.join(list(file_paths.values()))))
if len(tables) == 1:
file_path = list(file_paths.values())[0]
return pd.read_csv(file_path, delimiter=delimiter, usecols=usecols,
dtype='object', na_values=na_values, keep_default_na=False)
else:
df_list = []
for tb, tb_file in file_paths.items():
dfi = pd.read_csv(tb_file, delimiter=delimiter,
usecols=usecols.get(tb), dtype='object',
na_values=na_values, keep_default_na=False)
if len(dfi) > 0:
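                # Index on report_id (kept also as a column) so the per-table
                # dataframes align on report_id in the outer concat below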
dfi.set_index('report_id', inplace=True, drop=False)
dfi.columns = pd.MultiIndex.from_product([[tb], dfi.columns])
df_list.append(dfi)
else:
                logger.warning('Table {} is empty in the file system; not added to the final dataframe'.format(tb))
if len(df_list) > 0:
merged = pd.concat(df_list, axis=1, join='outer')
merged.reset_index(drop=True, inplace=True)
return merged
else:
            logger.error('All tables are empty in the file system')
return
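

# A minimal usage sketch (not part of the original module); the directory,
# identifier and column names below are hypothetical placeholders.
if __name__ == '__main__':
    # Read the full table set: returns a dataframe with (table, field)
    # multi-indexed columns
    df_all = read_tables('./cdm_tables', '2010-07')
    # Read a single table with a column subset: simple column indexing
    df_hdr = read_tables('./cdm_tables', '2010-07', cdm_subset=['header'],
                         col_subset=['report_id', 'latitude', 'longitude'])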