# Source code for cdm.lib.tables.tables_hdlr

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Created on Thu Apr 11 13:45:38 2019

Module to handle C3S Climate Data Store Common Data Model (CDM) tables within
the cdm tool.

@author: iregon
"""

# we remove python2 portability regarding OrderedDictionaries:
# from collections import OrderedDict
# This is because python2 dictionaries do not keep key insertion order: this should only matter creating final tables
# tables[key] = json.load(json_file, object_pairs_hook=OrderedDict)

import os
import glob
import json
import requests
import csv
from cdm.common import logging_hdlr
from cdm import properties

# Absolute path of the directory that contains this module.
module_path = os.path.dirname(os.path.abspath(__file__))
# Directory searched for the '*.json' table definitions (the module directory).
table_path = module_path


def load_tables(log_level='DEBUG'):
    """
    Load the CDM table definitions stored as JSON files in ``table_path``.

    Every ``*.json`` file found in ``table_path`` is loaded under a key equal
    to its file name up to the first dot. The generic ``observations`` entry
    is removed and its file is loaded once for every name in
    ``properties.cdm_tables`` that starts with ``observations-``.

    Parameters
    ----------
    log_level: level passed to ``logging_hdlr.init_logger``

    Returns
    -------
    tables: dictionary ``{table_name: parsed JSON content}``, or ``None`` if
        any table could not be loaded (the failure is logged as an error)
    """
    logger = logging_hdlr.init_logger(__name__, level=log_level)

    # Table name (file basename up to its first dot) -> path of its JSON file
    table_paths = {
        os.path.basename(path).split(".")[0]: path
        for path in glob.glob(os.path.join(table_path, '*.json'))
    }

    # Drop the generic observations entry and reuse its file for each of the
    # observations tables defined in properties. If the generic file does not
    # exist the path is None and loading fails (and is logged) below.
    observation_path = table_paths.pop('observations', None)
    for table_name in properties.cdm_tables:
        if table_name.startswith('observations-'):
            table_paths[table_name] = observation_path

    tables = {}
    for key, path in table_paths.items():
        try:
            # JSON is UTF-8 by specification: do not depend on the locale
            with open(path, encoding='utf-8') as json_file:
                tables[key] = json.load(json_file)
        except Exception:
            logger.error('Could not load table {}'.format(key), exc_info=True)
            return None
    return tables


# cdm elements dtypes
# Mail sent may 7th to Dave. Are the types there real SQL types, or just approximations?
# Numeric type in table definition not useful here to define floats with a specific precision
# We should be able to use those definitions. Keep in mind that arrays are object type in pandas!
# Remember any int and float (int, numeric) need to be tied for the parser!!!!
# Also datetimes!
# Until CDM table definition gets clarified:
# We map from the cdm table definition types to those in properties.pandas_dtypes.get('from_sql'), else: 'object'
# We update to the df column dtype if it is of float type


def from_glamod(table_filename, gitlinkroot=None, element_col=1, type_col=2, field_separator='\t', skip_lines=3):
    """
    Get tables from GLAMOD Git repo and format to nested dictionary with: ``{cdm_name: {'data_type':value}}``

    Parameters
    ----------
    table_filename: table filename in repo directory
    gitlinkroot: url to directory where tables are stored. If not given, the
        ``table_definitions`` directory of the glamod/common_data_model
        GitHub repository (master branch) is used
    element_col: column with element names (first is 1)
    type_col: column with element data types (first is 1)
    field_separator: tab as default
    skip_lines: number of leading lines to skip

    Returns
    -------
    dict: ``{element_name: {'data_type': data_type}}`` with one entry per
        table row, where ``data_type`` has the primary key marker ``(pk)``,
        the optional marker ``*`` and surrounding whitespace removed

    Raises
    ------
    requests.HTTPError: if the server answers with an error status
    """

    # About data type definitions in this source (table_definitions in GitHub):
    # it is not controlled vocab. and might change in the future!!!!

    logger = logging_hdlr.init_logger(__name__, level='INFO')
    if not gitlinkroot:
        gitlinkroot = 'https://github.com/glamod/common_data_model/blob/master/table_definitions/'
        logger.info('Setting gitlink root to default: {}'.format(gitlinkroot))

    # Rewrite the browsable repository url into its raw-content form
    gitlinkroot = gitlinkroot.replace('blob/', '')
    gitlinkroot = gitlinkroot.replace('https://', 'https://raw.')
    # urls are always '/' separated: os.path.join would use '\' on Windows
    url = '{}/{}'.format(gitlinkroot.rstrip('/'), table_filename.lstrip('/'))
    response = requests.get(url)
    # Fail here instead of parsing an error page as if it were a table
    response.raise_for_status()

    text_lines = response.content.decode('utf-8').splitlines()
    rows = list(csv.reader(text_lines, delimiter=field_separator))[skip_lines:]

    # Blank lines and rows too short to hold both columns are ignored
    min_columns = max(element_col, type_col)
    elements = {}
    for row in rows:
        if len(row) < min_columns:
            continue
        # Get data types and clean primary key, optional and whitespaces:
        # '(pk)', '*'. The marker is removed as a substring: str.strip('(pk)')
        # would strip the characters '(', 'p', 'k', ')' from both ends.
        data_type = row[type_col - 1].replace('(pk)', '').strip().strip('*').strip()
        elements[row[element_col - 1]] = {'data_type': data_type}
    return elements