#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 11 13:45:38 2019
Exports tables written in the C3S Climate Data Store Common Data Model (CDM) format to ascii files.
The tables format is contained in a python dictionary, stored as an attribute in a pandas.DataFrame
(or pd.io.parsers.TextFileReader).
This module uses a set of printer functions to "print" element values to a
string object before exporting them to a final ascii file.
Each of the CDM table elements has a data type (pseudo-sql as defined in the CDM documentation) which defines
which printer function needs to be used.
Numeric data types are printed with a specific number of decimal places, defined in the data element attributes. This
can vary according to each CDM, element, imodel and mapping .json file. If this is not defined in the input attributes
of the imodel, the number of decimal places used comes from a default value defined in properties.py
@author: iregon
"""
import os
import pandas as pd
import numpy as np
from io import StringIO
from cdm import properties
from cdm.common import pandas_TextParser_hdlr
from cdm.common import logging_hdlr
# Absolute path of the directory that contains this module file.
module_path = os.path.split(os.path.abspath(__file__))[0]
def print_integer(data, null_label):
    """
    Prints all elements that have 'int' as type attribute

    Non-null values are cast to ``int`` and then to ``str``; null values are
    replaced by ``null_label``. The result is built in a new object-dtype
    series, so the input series (and the table it belongs to) is left
    untouched. Writing strings into a numeric series in place is also a
    dtype-incompatible assignment that recent pandas versions warn about.

    Parameters
    ----------
    data: pandas.Series with the element values to print
    null_label: specified how nan are represented

    Returns
    -------
    data: new pandas.Series (object dtype, same index and name as the input)
        holding the integers as strings
    """
    valid = data.notna().values
    # Start from a column full of null labels and overwrite the valid slots.
    values = np.full(len(data), null_label, dtype=object)
    if valid.any():
        values[valid] = data.iloc[valid].astype(int).astype(str).tolist()
    return pd.Series(values, index=data.index, name=data.name, dtype=object)
def print_float(data, null_label, decimal_places=None):
    """
    Prints all elements that have 'float' as type attribute

    Non-null values are formatted as fixed-point strings with
    ``decimal_places`` decimals; null values are replaced by ``null_label``.
    The result is built in a new object-dtype series, so the input series
    (and the table it belongs to) is left untouched. Writing strings into a
    float series in place is also a dtype-incompatible assignment that recent
    pandas versions warn about.

    Parameters
    ----------
    data: pandas.Series with the element values to print
    null_label: specified how nan are represented
    decimal_places: number of decimal places; when None,
        ``properties.default_decimal_places`` is used

    Returns
    -------
    data: new pandas.Series (object dtype, same index and name as the input)
        holding the formatted floats as strings
    """
    if decimal_places is None:
        decimal_places = properties.default_decimal_places
    format_float = '{:.' + str(decimal_places) + 'f}'
    valid = data.notna().values
    # Start from a column full of null labels and overwrite the valid slots.
    values = np.full(len(data), null_label, dtype=object)
    if valid.any():
        values[valid] = [format_float.format(x) for x in data.iloc[valid]]
    return pd.Series(values, index=data.index, name=data.name, dtype=object)
def print_datetime(data, null_label):
    """
    Print datetime objects in the format: "%Y-%m-%d %H:%M:%S"

    Null values are replaced by ``null_label``. The result is built in a new
    object-dtype series, so the input series (and the table it belongs to) is
    left untouched. The ``.dt`` accessor is only used when there is at least
    one non-null value, so a column with no valid values is printed as
    null labels whatever its dtype.

    Parameters
    ----------
    data: pandas.Series of date time elements (datetime-like dtype)
    null_label: specified how nan are represented

    Returns
    -------
    data: new pandas.Series (object dtype, same index and name as the input)
        holding the formatted timestamps as strings
    """
    valid = data.notna().values
    # Start from a column full of null labels and overwrite the valid slots.
    values = np.full(len(data), null_label, dtype=object)
    if valid.any():
        values[valid] = data.iloc[valid].dt.strftime("%Y-%m-%d %H:%M:%S").tolist()
    return pd.Series(values, index=data.index, name=data.name, dtype=object)
def print_varchar(data, null_label):
    """
    Prints string elements

    Non-null values are cast to ``str``; null values are replaced by
    ``null_label``. The result is built in a new object-dtype series, so the
    input series (and the table it belongs to) is left untouched.

    Parameters
    ----------
    data: pandas.Series with the element values to print
    null_label: specified how nan are represented

    Returns
    -------
    data: new pandas.Series (object dtype, same index and name as the input)
        holding the values as strings
    """
    valid = data.notna().values
    # Start from a column full of null labels and overwrite the valid slots.
    values = np.full(len(data), null_label, dtype=object)
    if valid.any():
        values[valid] = data.iloc[valid].astype(str).tolist()
    return pd.Series(values, index=data.index, name=data.name, dtype=object)
def print_integer_array(data, null_label):
    """
    Prints a series of integer objects as array

    Every entry of ``data`` is handed to ``print_integer_array_i`` together
    with ``null_label``.

    Parameters
    ----------
    data: data tables to print
    null_label: specified how nan are represented

    Returns
    -------
    data: result of applying ``print_integer_array_i`` to each entry
    """
    def _print_row(row):
        return print_integer_array_i(row, null_label=null_label)

    return data.apply(_print_row)
# TODO: tell this to dave and delete them... put error messages in functions above
def print_float_array(data, null_label, decimal_places=None):
    """
    Placeholder printer for 'numeric[]' elements

    No float array printer is implemented: all arguments are ignored and a
    fixed message string is returned instead of printed data.

    Parameters
    ----------
    data: ignored
    null_label: ignored
    decimal_places: ignored

    Returns
    -------
    str: the message 'float array not defined in printers'
    """
    message = 'float array not defined in printers'
    return message
def print_datetime_array(data, null_label):
    """
    Placeholder printer for 'timestamp with timezone[]' elements

    No datetime array printer is implemented: all arguments are ignored and a
    fixed message string is returned instead of printed data.

    Parameters
    ----------
    data: ignored
    null_label: ignored

    Returns
    -------
    str: the message 'datetime tz array not defined in printers'
    """
    message = 'datetime tz array not defined in printers'
    return message
def print_varchar_array(data, null_label):
    """
    Prints a series of string objects as array

    Every entry of ``data`` is handed to ``print_varchar_array_i`` together
    with ``null_label``.

    Parameters
    ----------
    data: data tables to print
    null_label: specified how nan are represented

    Returns
    -------
    data: result of applying ``print_varchar_array_i`` to each entry
    """
    def _print_row(row):
        return print_varchar_array_i(row, null_label=null_label)

    return data.apply(_print_row)
# Dispatch table: CDM data type name -> printer function used for elements of that type.
printers = {
    'int': print_integer,
    'numeric': print_float,
    'varchar': print_varchar,
    'timestamp with timezone': print_datetime,
    'int[]': print_integer_array,
    'numeric[]': print_float_array,
    'varchar[]': print_varchar_array,
    'timestamp with timezone[]': print_datetime_array,
}
# Per data type: names of the element attributes that are forwarded to the
# printer as keyword arguments. Types not listed here get no extra arguments.
iprinters_kwargs = {
    'numeric': ['decimal_places'],
    'numeric[]': ['decimal_places'],
}
def print_integer_array_i(row, null_label=None):
    """
    Prints a single integer array entry as '{i1,i2,...}'

    The entry is evaluated as a python expression; a result that is not a
    list is wrapped into a one-element list. Non-finite items are skipped and
    the remaining ones are cast to ``int``.

    Parameters
    ----------
    row: string with the python representation of the array, or nan
    null_label: value returned when the entry is nan or no item is left

    Returns
    -------
    str: comma separated integers enclosed in braces, or ``null_label``
    """
    # nan is the only value that does not compare equal to itself
    if not row == row:
        return null_label
    # NOTE(review): eval() executes arbitrary expressions; only safe while the
    # table content is trusted - consider ast.literal_eval.
    parsed = eval(row)
    items = parsed if isinstance(parsed, list) else [parsed]
    joined = ','.join([str(int(item)) for item in items if np.isfinite(item)])
    if joined:
        return '{' + joined + '}'
    return null_label
def print_varchar_array_i(row, null_label=None):
    """
    Prints a single string array entry as '{s1,s2,...}'

    The entry is evaluated as a python expression; a result that is not a
    list is wrapped into a one-element list. Empty (falsy) items are skipped.

    Parameters
    ----------
    row: string with the python representation of the array, or nan
    null_label: value returned when the entry is nan or no item is left

    Returns
    -------
    str: comma separated items enclosed in braces, or ``null_label``
    """
    # nan is the only value that does not compare equal to itself
    if not row == row:
        return null_label
    # NOTE(review): eval() executes arbitrary expressions; only safe while the
    # table content is trusted - consider ast.literal_eval.
    parsed = eval(row)
    items = parsed if isinstance(parsed, list) else [parsed]
    joined = ','.join(item for item in items if item)
    if joined:
        return '{' + joined + '}'
    return null_label
def table_to_ascii(table, table_atts, delimiter='|', null_label='null', cdm_complete=True, filename=None,
                   full_table=True, log_level='INFO'):
    """
    Exports a cdm table to an ascii file.

    Every element listed in ``table_atts`` is printed to a string column with
    the printer registered in ``printers`` for its ``data_type``; elements
    missing from ``table`` are filled with ``null_label``. The resulting
    table is written with ``pandas.DataFrame.to_csv``.

    Side effect: if ``table`` has an ``observation_value`` column, rows where
    it is null are dropped from ``table`` in place.

    If the table ends up empty (or ``observation_value`` is a defined element
    but not a column of ``table``), a warning is logged and only the header
    line is written.

    Parameters
    ----------
    table:
        pandas.Dataframe to export
    table_atts: attributes of the pandas.Dataframe stored as a python dictionary.
        Maps each element name to a dictionary of its attributes; ``data_type``
        selects the printer and the names listed in ``iprinters_kwargs``
        (e.g. decimal places) are forwarded to it.
    delimiter:
        field separator, default '|'
    null_label:
        specified how nan are represented
    cdm_complete: if True (default) all the elements in ``table_atts`` are written,
        otherwise only those that are columns of ``table``
    filename:
        the name of the file to store the data, passed to ``to_csv``
    full_table:
        not used in this function
    log_level:
        level of logging information to be saved

    Returns
    -------
    None. Saves the cdm table as an ascii file
    """
    logger = logging_hdlr.init_logger(__name__, level=log_level)
    element_names = table_atts.keys()

    # Records without an observation value are not exported.
    if 'observation_value' in table:
        table.dropna(subset=['observation_value'], inplace=True)
        is_empty = len(table) == 0
    elif 'observation_value' in element_names:
        is_empty = True
    else:
        is_empty = len(table) == 0

    if is_empty:
        logger.warning('No observation values in table')
        header_only = pd.DataFrame(columns=element_names, dtype='object')
        header_only.to_csv(filename, index=False, sep=delimiter, header=True, mode='w')
        return

    ascii_table = pd.DataFrame(index=table.index, columns=element_names, dtype='object')
    for element in element_names:
        if element not in table:
            ascii_table[element] = null_label
            continue
        element_atts = table_atts.get(element)
        data_type = element_atts.get('data_type')
        printer = printers.get(data_type)
        if not printer:
            # The column is left as created (all missing) in the output table.
            logger.error('No printer defined for element {}'.format(element))
            continue
        extra_names = iprinters_kwargs.get(data_type)
        if extra_names:
            extra = {key: element_atts.get(key) for key in extra_names}
        else:
            extra = {}
        ascii_table[element] = printer(table[element], null_label, **extra)

    if cdm_complete:
        columns_to_ascii = element_names
    else:
        columns_to_ascii = [name for name in element_names if name in table.columns]
    # NOTE: a chunked (TextParser) export that appended to the file chunk by
    # chunk used to live here; it was dropped because the mapper now only
    # produces real dataframes, so the file is always written in one go.
    ascii_table.to_csv(filename, index=False, sep=delimiter, columns=columns_to_ascii, header=True, mode='w')
def cdm_to_ascii(cdm, delimiter='|', null_label='null', cdm_complete=True, extension='psv', out_dir=None, suffix=None,
                 prefix=None, log_level='INFO'):
    """
    Exports a complete cdm file with multiple tables to ascii files.

    Each table in ``cdm`` is written with ``table_to_ascii`` to its own file,
    named ``<prefix>-<table>-<suffix>.<extension>`` (empty parts are left
    out) and placed in ``out_dir`` when one is given.

    Parameters
    ----------
    cdm:
        common data model tables to export: a dictionary mapping each table
        name to a dictionary with the keys ``data`` and ``atts``
    delimiter:
        field separator, default '|'
    null_label:
        specified how nan are represented
    cdm_complete:
        passed on to ``table_to_ascii``
    extension:
        file extension without the leading dot, default 'psv'
    out_dir:
        where to store the ascii files; if not set, the bare file name is used
    suffix:
        file suffix
    prefix:
        file prefix
    log_level:
        level of logging information

    Returns
    -------
    None. Saves the cdm tables as ascii files.
    """
    logger = logging_hdlr.init_logger(__name__, level=log_level)
    # NOTE: exporting has side effects on the input: the printers are written
    # so that they modify the original data frame, and rows with an empty
    # observation_value are removed from the observation tables.
    dotted_extension = '.' + extension
    for table_name in cdm.keys():
        logger.info('Printing table {}'.format(table_name))
        name_parts = [part for part in (prefix, table_name, suffix) if part]
        filename = '-'.join(name_parts) + dotted_extension
        if out_dir:
            filepath = os.path.join(out_dir, filename)
        else:
            filepath = filename
        table_to_ascii(cdm[table_name]['data'], cdm[table_name]['atts'], delimiter=delimiter,
                       null_label=null_label, cdm_complete=cdm_complete, filename=filepath,
                       log_level=log_level)