#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 11 13:45:38 2019

Exports tables written in the C3S Climate Data Store Common Data Model (CDM) format to ascii files.
The tables format is contained in a python dictionary, stored as an attribute in a pandas.DataFrame
(or pd.io.parsers.TextFileReader).

This module uses a set of printer functions to "print" element values to a
string object before exporting them to a final ascii file.

Each CDM table element has a data type (pseudo-sql as defined in the CDM documentation) which defines
which printer function needs to be used.

Numeric data types are printed with a specific number of decimal places, defined in the data element attributes. This
can vary according to each CDM, element, imodel and mapping .json file. If this is not defined in the input attributes
of the imodel, the number of decimal places used comes from a default value defined in properties.py

@author: iregon
"""
import os
import pandas as pd
import numpy as np
from io import StringIO
from cdm import properties
from cdm.common import pandas_TextParser_hdlr
from cdm.common import logging_hdlr
# Absolute path of the directory that contains this module.
module_path = os.path.split(os.path.abspath(__file__))[0]
def print_integer(data, null_label):
    """
    Print all elements that have 'int' as type attribute.

    Non-null values are cast to ``int`` and then to ``str``; null values are
    replaced by ``null_label``. The input is not modified: the result is
    built on an object-dtype copy.

    Parameters
    ----------
    data: pandas.Series
        data to print
    null_label: str
        specifies how nan are represented

    Returns
    -------
    pandas.Series
        object-dtype series with the same index holding the printed values
    """
    valid = data.notna().values
    # astype(object) returns a copy, so the caller's series keeps its values
    # and dtype, and strings are never written into a numeric column.
    printed = data.astype(object)
    if valid.any():
        printed.iloc[valid] = data.iloc[valid].astype(int).astype(str).tolist()
    printed.iloc[~valid] = null_label
    return printed
def print_float(data, null_label, decimal_places=None):
    """
    Print all elements that have 'float' as type attribute.

    Non-null values are formatted with a fixed number of decimal places;
    null values are replaced by ``null_label``. The input is not modified:
    the result is built on an object-dtype copy.

    Parameters
    ----------
    data: pandas.Series
        data to print
    null_label: str
        specifies how nan are represented
    decimal_places: int, optional
        number of decimal places; when ``None``,
        ``properties.default_decimal_places`` is used

    Returns
    -------
    pandas.Series
        object-dtype series with the same index holding the printed values
    """
    if decimal_places is None:
        decimal_places = properties.default_decimal_places
    float_format = '{:.' + str(decimal_places) + 'f}'
    valid = data.notna().values
    # astype(object) returns a copy, so the caller's series keeps its values
    # and dtype, and strings are never written into a numeric column.
    printed = data.astype(object)
    if valid.any():
        printed.iloc[valid] = data.iloc[valid].apply(float_format.format).tolist()
    printed.iloc[~valid] = null_label
    return printed
def print_datetime(data, null_label):
    """
    Print datetime objects in the format "%Y-%m-%d %H:%M:%S".

    Null values are replaced by ``null_label``. The input is not modified:
    the result is built on an object-dtype copy.

    Parameters
    ----------
    data: pandas.Series
        date time elements
    null_label: str
        specifies how nan are represented

    Returns
    -------
    pandas.Series
        object-dtype series with the same index holding the printed values
    """
    valid = data.notna().values
    # astype(object) returns a copy, so the caller's series is left untouched.
    printed = data.astype(object)
    # Only touch the .dt accessor when there is something to format: it is
    # not available on an all-null column stored with object dtype.
    if valid.any():
        printed.iloc[valid] = data.iloc[valid].dt.strftime("%Y-%m-%d %H:%M:%S").tolist()
    printed.iloc[~valid] = null_label
    return printed
def print_varchar(data, null_label):
    """
    Print string elements.

    Non-null values are cast to ``str``; null values are replaced by
    ``null_label``. The input is not modified: the result is built on an
    object-dtype copy.

    Parameters
    ----------
    data: pandas.Series
        data to print
    null_label: str
        specifies how nan are represented

    Returns
    -------
    pandas.Series
        object-dtype series with the same index holding the printed values
    """
    valid = data.notna().values
    # astype(object) returns a copy, so the caller's series is left untouched.
    printed = data.astype(object)
    if valid.any():
        printed.iloc[valid] = data.iloc[valid].astype(str).tolist()
    printed.iloc[~valid] = null_label
    return printed
def print_integer_array(data, null_label):
    """
    Print a series of integer objects as arrays.

    Every element is rendered individually by ``print_integer_array_i``.

    Parameters
    ----------
    data: pandas.Series
        data to print
    null_label: str
        specifies how nan are represented

    Returns
    -------
    pandas.Series
        result of applying ``print_integer_array_i`` to each element
    """
    def _render(element):
        return print_integer_array_i(element, null_label=null_label)

    return data.apply(_render)
# TODO: tell this to dave and delete them... put error messages in functions above
def print_float_array(data, null_label, decimal_places=None):
    """
    Placeholder printer for float arrays.

    No float-array printer is implemented: all arguments are ignored and a
    fixed message is returned.

    Returns
    -------
    str
        the message 'float array not defined in printers'
    """
    message = 'float array not defined in printers'
    return message
def print_datetime_array(data, null_label):
    """
    Placeholder printer for datetime arrays.

    No datetime-array printer is implemented: all arguments are ignored and
    a fixed message is returned.

    Returns
    -------
    str
        the message 'datetime tz array not defined in printers'
    """
    message = 'datetime tz array not defined in printers'
    return message
def print_varchar_array(data, null_label):
    """
    Print a series of string objects as arrays.

    Every element is rendered individually by ``print_varchar_array_i``.

    Parameters
    ----------
    data: pandas.Series
        data to print
    null_label: str
        specifies how nan are represented

    Returns
    -------
    pandas.Series
        result of applying ``print_varchar_array_i`` to each element
    """
    def _render(element):
        return print_varchar_array_i(element, null_label=null_label)

    return data.apply(_render)
# Maps each CDM (pseudo-sql) data type to the function used to print it.
printers = {
    'int': print_integer,
    'numeric': print_float,
    'varchar': print_varchar,
    'timestamp with timezone': print_datetime,
    'int[]': print_integer_array,
    'numeric[]': print_float_array,
    'varchar[]': print_varchar_array,
    'timestamp with timezone[]': print_datetime_array,
}
# For each data type, the element attributes that are forwarded to its
# printer as keyword arguments.
iprinters_kwargs = {
    'numeric': ['decimal_places'],
    'numeric[]': ['decimal_places'],
}
def print_integer_array_i(row, null_label=None):
    """
    Print a single integer-array element as ``{i1,i2,...}``.

    Parameters
    ----------
    row:
        element to print; it is passed to ``eval``, so it is expected to be
        a string holding a list or a single value, or NaN when missing
    null_label: str, optional
        value returned when the element is missing or has no finite values

    Returns
    -------
    str
        the finite values cast to int, comma separated and wrapped in
        braces, or ``null_label``
    """
    # NaN is the only value that does not compare equal to itself.
    if not (row == row):
        return null_label
    # NOTE(review): eval() executes whatever expression the element holds;
    # confirm the data source is trusted (ast.literal_eval would be safer).
    parsed = eval(row)
    items = parsed if isinstance(parsed, list) else [parsed]
    body = ','.join(str(int(item)) for item in items if np.isfinite(item))
    if body:
        return '{' + body + '}'
    return null_label
def print_varchar_array_i(row, null_label=None):
    """
    Print a single string-array element as ``{s1,s2,...}``.

    Parameters
    ----------
    row:
        element to print; it is passed to ``eval``, so it is expected to be
        a string holding a list of strings or a single string, or NaN when
        missing
    null_label: str, optional
        value returned when the element is missing or nothing is left to
        print

    Returns
    -------
    str
        the non-empty items, comma separated and wrapped in braces, or
        ``null_label``
    """
    # NaN is the only value that does not compare equal to itself.
    if not (row == row):
        return null_label
    # NOTE(review): eval() executes whatever expression the element holds;
    # confirm the data source is trusted (ast.literal_eval would be safer).
    parsed = eval(row)
    items = parsed if isinstance(parsed, list) else [parsed]
    # Falsy items (empty strings, None) are skipped before joining.
    body = ','.join(item for item in items if item)
    if body:
        return '{' + body + '}'
    return null_label
def table_to_ascii(table, table_atts, delimiter='|', null_label='null', cdm_complete=True, filename=None,
                   full_table=True, log_level='INFO'):
    """
    Export a cdm table to an ascii file.

    Exports tables written in the C3S Climate Data Store Common Data Model (CDM) format to ascii files.
    The tables format is contained in a python dictionary, stored as an attribute in a ``pandas.DataFrame``
    (or ``pd.io.parsers.TextFileReader``).

    Rows without ``observation_value`` are not exported. When nothing is left
    to export (or the table attributes declare ``observation_value`` but the
    table has no such column), a header-only file is written. The ``table``
    passed in is not modified by this function.

    Parameters
    ----------
    table: pandas.DataFrame
        pandas.Dataframe to export
    table_atts: dict
        attributes of the pandas.Dataframe stored as a python dictionary.
        This contains all element names, characteristics and types encoding,
        as well as other characteristics e.g. decimal places, etc.
    delimiter: str
        column separator of the output file, default '|'
    null_label: str
        specifies how nan are represented
    cdm_complete: bool
        if ``True`` (default) every element in ``table_atts`` is written;
        otherwise only the elements that are present in ``table``
    filename: str
        the name of the file to store the data
    full_table: bool
        not used by this function; kept so existing callers keep working
    log_level: str
        level of logging information to be saved

    Returns
    -------
    None
        Saves the cdm table as an ascii file
    """
    logger = logging_hdlr.init_logger(__name__, level=log_level)
    elements = list(table_atts)

    if 'observation_value' in table:
        # Not in place: the caller's table must keep all of its rows.
        table = table.dropna(subset=['observation_value'])
        empty_table = len(table) == 0
    elif 'observation_value' in table_atts:
        # An observations table without the observation_value column.
        empty_table = True
    else:
        empty_table = len(table) == 0

    if empty_table:
        logger.warning('No observation values in table')
        ascii_table = pd.DataFrame(columns=elements, dtype='object')
        ascii_table.to_csv(filename, index=False, sep=delimiter, header=True, mode='w')
        return

    ascii_table = pd.DataFrame(index=table.index, columns=elements, dtype='object')
    for iele in elements:
        if iele not in table:
            ascii_table[iele] = null_label
            continue
        atts = table_atts.get(iele)
        itype = atts.get('data_type')
        printer = printers.get(itype)
        if not printer:
            # The column is left empty (NaN) in the output table.
            logger.error('No printer defined for element {}'.format(iele))
            continue
        kwargs = {x: atts.get(x) for x in iprinters_kwargs.get(itype) or []}
        # Printers may overwrite the series they receive, so hand them a copy.
        ascii_table[iele] = printer(table[iele].copy(), null_label, **kwargs)

    if cdm_complete:
        columns_to_ascii = elements
    else:
        columns_to_ascii = [x for x in elements if x in table.columns]
    ascii_table.to_csv(filename, index=False, sep=delimiter, columns=columns_to_ascii, header=True, mode='w')
# # Convert to iterable if plain dataframe
# # This is no longer needed as the mapper now only produces real dataframes,
# # never TextParser...
# if isinstance(table,pd.DataFrame):
# table = [table]
# ichunk = 0
# for itable in table:
# # drop records with no 'observation_value'
# empty_table = False
# if 'observation_value' in itable:
# itable.dropna(subset=['observation_value'],inplace=True)
# empty_table = True if len(itable) == 0 else False
# elif 'observation_value' in table_atts.keys():
# empty_table = True
# if empty_table:
# logger.warning('No observation values in table')
# ascii_table = pd.DataFrame(columns = table_atts.keys(), dtype = 'object')
# ascii_table.to_csv(filename, index = False, sep = delimiter, header = True, mode = 'w')
# break
# ascii_table = pd.DataFrame(index = itable.index, columns = table_atts.keys(), dtype = 'object')
# for iele in table_atts.keys():
# if iele in itable:
# itype = table_atts.get(iele).get('data_type')
# if printers.get(itype):
# iprinter_kwargs = iprinters_kwargs.get(itype)
# if iprinter_kwargs:
# kwargs = { x:table_atts.get(iele).get(x) for x in iprinter_kwargs}
# else:
# kwargs = {}
# ascii_table[iele] = printers.get(itype)(itable[iele], null_label, **kwargs)
# else:
# logger.error('No printer defined for element {}'.format(iele))
# else:
# ascii_table[iele] = null_label
# header = False if ichunk > 0 else True
# wmode = 'a' if ichunk > 0 else 'w'
# columns_to_ascii = [ x for x in table_atts.keys() if x in itable.columns ]
# if not cdm_complete else table_atts.keys()
# ascii_table.to_csv(filename, index = False, sep = delimiter,
# columns = columns_to_ascii, header = header, mode = wmode)
# ichunk += 1
def cdm_to_ascii(cdm, delimiter='|', null_label='null', cdm_complete=True, extension='psv', out_dir=None, suffix=None,
                 prefix=None, log_level='INFO'):
    """
    Export a complete cdm file with multiple tables to ascii files.

    Exports a complete cdm file with multiple tables written in the C3S Climate Data Store Common Data Model (CDM)
    format to ascii files.
    The tables format is contained in a python dictionary, stored as an attribute in a ``pandas.DataFrame``
    (or ``pd.io.parsers.TextFileReader``).

    One file is written per table, named ``<prefix>-<table>-<suffix>.<extension>``;
    empty or missing ``prefix``/``suffix`` parts are left out of the name.

    Parameters
    ----------
    cdm: dict
        common data model tables to export; each entry provides the table
        under the key 'data' and its attributes under the key 'atts'
    delimiter: str
        column separator of the output files, default '|'
    null_label: str
        specifies how nan are represented
    cdm_complete: bool
        extract the entire cdm file
    extension: str
        file extension, default 'psv'
    out_dir: str
        where to store the ascii files; when not given, the bare file name
        is used
    suffix: str
        file suffix
    prefix: str
        file prefix
    log_level: str
        level of logging information

    Returns
    -------
    None
        Saves the cdm tables as ascii files in the given directory.
    """
    logger = logging_hdlr.init_logger(__name__, level=log_level)
    # NOTE(review): table_to_ascii and the printers it calls may modify the
    # DataFrames held in ``cdm`` (printed values, dropped rows without
    # observation_value); pass copies if the originals are needed afterwards.
    file_ext = '.' + extension
    for table in cdm.keys():
        logger.info('Printing table {}'.format(table))
        name_parts = [part for part in (prefix, table, suffix) if part]
        filename = '-'.join(name_parts) + file_ext
        if out_dir:
            filepath = os.path.join(out_dir, filename)
        else:
            filepath = filename
        contents = cdm[table]
        table_to_ascii(contents['data'], contents['atts'], delimiter=delimiter, null_label=null_label,
                       cdm_complete=cdm_complete, filename=filepath, log_level=log_level)