#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 11 13:45:38 2019
Reads files with the CDM table format from a file system to a pandas.Dataframe.
All CDM fields are read as objects. Null values are read with the specified null value
in the table files, or as NaN if the na_values argument is set to the a specific null
value in the file.
Reads the full set of files (default), a subset or a single table, as controlled
by cdm_subset:
- When reading multiple tables, the resulting dataframe is multi-indexed in
the columns, with (table-name, field) as column names. Merging of tables
occurs on the report_id field.
- When reading a single table, the resulting dataframe has simple indexing
in the columns.
Reads the full set of fields (default) or a subset of it, as controlled by
param col_subset:
- When reading multiple tables (default or subset), the col_subset is a
dictionary like: col_subset = {table0:[columns],...tablen:[columns]}
If a table is not specified in col_subset, all its fields are read.
- When reading a single table, the col_subset is a list like:
col_subset = [columns]
- It is assumed that the column names are all conform to the cdm field names
in lib.tables/*.json
The full table set (header, observations-*) is assumed to be in the same directory.
Filenames for tables are assumed to be:
tableName-<tb_id>.<extension>
with:
valid tableName: as declared in properties.cdm_tables
tb_id: any identifier including wildcards if required
extension: defaulting to 'psv'
When specifying a subset of tables, valid names are those in properties.cdm_tables
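
Example (a hedged sketch; the path, identifier and column names are
illustrative placeholders only):

    read_tables('/data/cdm', '2010-07',
                cdm_subset=['header', 'observations-sst'],
                col_subset={'header': ['report_id', 'latitude', 'longitude']})
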
@author: iregon
"""
import glob
import os

import pandas as pd

from cdm import properties
from cdm.common import logging_hdlr
module_path = os.path.dirname(os.path.abspath(__file__))
def read_tables(tb_path, tb_id, cdm_subset=None, delimiter='|',
                extension='psv', col_subset=None, log_level='INFO',
                na_values=None):
"""
Reads CDM table like files from file system to a pandas data frame.
Parameters
----------
tb_path:
path to the file
tb_id:
any identifier including wildcards if required extension, defaulting to 'psv'
cdm_subset: specifies a subset of tables or a single table.
- For multiple subsets of tables: This option will return a pandas.Dataframe that is multi-index at
the columns, with (table-name, field) as column names. Tables are merged via the report_id field.
- For a single table: the function returns a pandas.Dataframe with a simple indexing for the columns.
delimiter:
default is '|'
extension:
default is psv
col_subset: a python dictionary specifying the section or sections of the file to read
- For multiple sections of the tables:
e.g ``col_subset = {table0:[columns],...tablen:[columns]}``
- For a single section:
e.g. ``list type object col_subset = [columns]``
This variable assumes that the column names are all conform to the cdm field names in lib.tables/*.json
log_level: Level of logging messages to save
na_values: specifies the format of NaN values
Returns
-------
pandas.Dataframe: either the entire file or a subset of it.
logger.error: logs specific messages if there is any error.
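
    Examples
    --------
    A hedged sketch; the path and identifier are illustrative placeholders:

    >>> df = read_tables('/data/cdm', '2010-07',
    ...                  cdm_subset=['header', 'observations-at'])
    >>> df[('header', 'report_id')].head()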
"""
logger = logging_hdlr.init_logger(__name__, level=log_level)
    # Note: because of how the printers are written, they modify the original
    # dataframe, also removing rows with an empty observation_value in the
    # observation tables.
if not os.path.isdir(tb_path):
        logger.error('Data path not found: {}'.format(tb_path))
return
    # See if there is anything at all:
files = glob.glob(os.path.join(tb_path, '*' + tb_id + '*.' + extension))
if len(files) == 0:
logger.error('No files found matching pattern {}'.format(tb_id))
return
    # If a subset was requested, check that all tables are valid CDM tables
if cdm_subset:
for tb in cdm_subset:
if tb not in properties.cdm_tables:
logger.error('Requested table {} not defined in CDM'.format(tb))
return
tables = properties.cdm_tables if not cdm_subset else cdm_subset
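    # Build the expected file pattern per table: tableName-<tb_id>.<extension>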
file_patterns = {tb: os.path.join(tb_path, '-'.join([tb, tb_id]) + '.' + extension) for tb in tables}
file_paths = {}
for k, v in file_patterns.items():
logger.info('Getting file path for pattern {}'.format(v))
file_path = glob.glob(v)
if len(file_path) == 1:
file_paths[k] = file_path[0]
elif len(file_path) > 1:
logger.error(
'Pattern {0} resulted in multiple files for table {1}. '
                'Cannot unambiguously retrieve cdm table(s)'.format(tb_id, k))
return
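    # Tables whose pattern matched no file are skipped above; error out only
    # if no table file was found at all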
if len(file_paths) == 0:
logger.error(
            'No cdm table files found for search patterns: {}'.format(','.join(list(file_patterns.values()))))
return
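    # usecols drives column selection in pandas.read_csv: a flat list when a
    # single table is read, a per-table mapping (None meaning all columns) otherwise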
usecols = None if len(tables) == 1 else {table: None for table in tables}
if col_subset:
if len(tables) == 1:
if not isinstance(col_subset, list):
logger.error('Column subset (col_subset) has to be declared as a list')
return
else:
usecols = col_subset
else:
if not isinstance(col_subset, dict):
logger.error(
'Column subset (col_subset) has to be declared as a dictionary '
'with a table:[columns] pair per table to subset')
return
else:
usecols = {table: col_subset.get(table, None) for table in tables}
    logger.info('Reading data files into dataframe: {}'.format(','.join(list(file_paths.values()))))
if len(tables) == 1:
file_path = list(file_paths.values())[0]
return pd.read_csv(file_path, delimiter=delimiter, usecols=usecols,
dtype='object', na_values=na_values, keep_default_na=False)
else:
df_list = []
for tb, tb_file in file_paths.items():
dfi = pd.read_csv(tb_file, delimiter=delimiter,
usecols=usecols.get(tb), dtype='object',
na_values=na_values, keep_default_na=False)
if len(dfi) > 0:
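                # Index on report_id (kept also as a column) so the per-table
                # dataframes align on report_id in the outer concat below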
dfi.set_index('report_id', inplace=True, drop=False)
dfi.columns = pd.MultiIndex.from_product([[tb], dfi.columns])
df_list.append(dfi)
else:
                logger.warning('Table {} is empty in the file system; not added to the final dataframe'.format(tb))
if len(df_list) > 0:
merged = pd.concat(df_list, axis=1, join='outer')
merged.reset_index(drop=True, inplace=True)
return merged
else:
            logger.error('All tables are empty in the file system')
return
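

# A minimal usage sketch (not part of the original module); the directory,
# identifier and column names below are hypothetical placeholders.
if __name__ == '__main__':
    # Read the full table set: returns a dataframe with (table, field)
    # multi-indexed columns
    df_all = read_tables('./cdm_tables', '2010-07')
    # Read a single table with a column subset: simple column indexing
    df_hdr = read_tables('./cdm_tables', '2010-07', cdm_subset=['header'],
                         col_subset=['report_id', 'latitude', 'longitude'])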