Source code for mdf_reader.validator.validate

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 09:38:17 2019

Validates elements in a pandas DataFrame against its input data model. Output
is a boolean DataFrame

Validated elements are those with the following column_types:
    - any in properties.numeric_types: range validation
    - 'key': code table validation
    - 'datetime': because of the way they are converted when read into
    datetime, they should already be NaT if they do not validate as a valid
    datetime. The corresponding mask is simply created for them

DEV notes:
need to add tolerance to the numeric range validation

@author: iregon
"""

import os
import pandas as pd
import numpy as np
import logging
from .. import properties
from ..data_models import code_tables
from ..data_models import schemas

def validate_numeric(elements, data, schema):
    """Validate numeric elements against the ranges declared in the schema.

    Parameters
    ----------
    elements : list
        Column labels of ``data`` to validate.
    data : pandas.DataFrame
        Data holding (at least) the columns listed in ``elements``.
    schema : dict
        Mapping from element to its attribute dictionary. The optional
        ``valid_min`` and ``valid_max`` attributes are used as inclusive
        lower and upper bounds.

    Returns
    -------
    pandas.DataFrame
        Boolean frame indexed like ``data`` with one column per element:
        True where the value lies within the bounds or is missing (NaN),
        False otherwise. With no elements, a frame with no columns.
    """
    elements = list(elements)
    if not elements:
        # Nothing to validate: avoid comparing/assigning empty selections.
        return pd.DataFrame(index=data.index)

    # Find thresholds in schema. A missing element entry, a missing
    # attribute or an explicit None all mean "no bound on that side".
    lower = []
    upper = []
    for element in elements:
        atts = schema.get(element) or {}
        valid_min = atts.get('valid_min')
        valid_max = atts.get('valid_max')
        lower.append(-np.inf if valid_min is None else valid_min)
        upper.append(np.inf if valid_max is None else valid_max)

    # Flag elements for which at least one bound is not available -> warn
    unbounded = [
        str(element)
        for element, low, high in zip(elements, lower, upper)
        if low == -np.inf or high == np.inf
    ]
    if unbounded:
        logging.warning('Data numeric elements with missing upper or lower threshold: {}'.format(",".join(unbounded)))
        logging.warning('Corresponding upper and/or lower bounds set to +/-inf for validation')

    values = data[elements]
    mask = pd.DataFrame(index=data.index, data=False, columns=elements)
    # The bound lists are matched to the selected columns by position.
    # Comparisons with NaN are False, so missing values are explicitly
    # accepted through isna().
    mask[elements] = ((values >= lower) & (values <= upper)) | values.isna()
    return mask
def validate_codes(elements, data, code_tables_path, schema, supp=False):
    """Validate coded ('key') elements against their code tables.

    Parameters
    ----------
    elements : list
        Column labels of ``data`` to validate. When ``supp`` is True each
        element is indexed as a 2-item sequence (``element[0]``,
        ``element[1]``).
    data : pandas.DataFrame
        Data holding the elements and any additional key columns named by
        the code table.
    code_tables_path : str
        Directory containing the ``<codetable>.json`` files.
    schema : dict
        Mapping from element to its attribute dictionary; the ``codetable``
        attribute names the table and ``column_type`` selects the dtype
        used before the lookup.
    supp : bool, optional
        Selects how key column labels are built (see the inline comments).

    Returns
    -------
    pandas.DataFrame
        Boolean frame indexed like ``data`` with one column per element.
        An element stays all False when the code tables directory, its
        code table definition or its code table file is missing, or when
        an error is raised while validating it. Otherwise a row is True
        when any of its key values is missing, or when its joined key is
        found among the table keys.
    """
    separator = "∿"  # joins the values of multi-key combinations
    mask = pd.DataFrame(index=data.index, data=False, columns=elements)

    if not os.path.isdir(code_tables_path):
        logging.error('Code tables path {} not found'.format(code_tables_path))
        logging.warning('All coded elements set to False')
        return mask

    for element in elements:
        # An element without a schema entry is handled like one without a
        # code table instead of aborting the validation of all elements.
        code_table = (schema.get(element) or {}).get('codetable')
        if not code_table:
            logging.error('Code table not defined for element {}'.format(element))
            logging.warning('Element mask set to False')
            continue

        code_table_path = os.path.join(code_tables_path, code_table + '.json')
        # Eval elements: if ._yyyy, ._xxx in name: pd.DateTimeIndex().xxxx is the element to pass
        # Additionally, on doing this, should make sure that element is a datetime type:
        if not os.path.isfile(code_table_path):
            logging.error('Error validating coded element {}:'.format(element))
            logging.error('Code table file {} not found'.format(code_table_path))
            logging.warning('Element mask set to False')
            continue

        # Deliberately best-effort: any failure for one element is logged
        # and leaves that element's mask False without stopping the rest.
        try:
            table = code_tables.read_table(code_table_path)

            # Columns making up the key: the element itself, unless the
            # table declares a '_keys' entry for it.
            name = element[1] if supp else element
            if table.get('_keys'):
                key_names = list(table['_keys'].get(name))
            else:
                key_names = [name]

            # Build the full column labels of the key columns.
            if supp:
                key_elements = [(element[0], x) for x in key_names]
            else:
                key_elements = [
                    x if isinstance(x, tuple) else (properties.dummy_level, x)
                    for x in key_names
                ]

            dtypes = {
                x: properties.pandas_dtypes.get(schema.get(x).get('column_type'))
                for x in key_elements
            }
            table_keys = code_tables.table_keys(table)
            table_keys_str = [
                separator.join(x) if isinstance(x, list) else x
                for x in table_keys
            ]

            validation_df = data[key_elements]
            # Only rows with every key value present are looked up; the
            # others keep the default True.
            complete = np.where(validation_df.notna().all(axis=1))[0]
            imask = pd.Series(index=data.index, data=True)
            if len(complete) > 0:
                # Skipped when no row is complete, so the row-wise join
                # is never applied to an empty selection.
                joined = (
                    validation_df.iloc[complete, :]
                    .astype(dtypes)
                    .astype('str')
                    .apply(separator.join, axis=1)
                )
                imask.iloc[complete] = joined.isin(table_keys_str).to_numpy()
            mask[element] = imask
        except Exception as e:
            logging.error('Error validating coded element {}:'.format(element))
            logging.error('Error is {}:'.format(e))
            logging.warning('Element mask set to False')

    return mask
def validate(data, mask0, schema, code_tables_path):
    """Validate the elements of a DataFrame against its data model.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to validate.
    mask0 : pandas.DataFrame
        Pre-existing boolean mask; elements already False in it are forced
        to False in the output for the validated columns.
    schema : dict
        Data model schema, passed to ``schemas.df_schema`` together with
        the data columns to obtain the per-element attributes.
    code_tables_path : str
        Directory with the code tables, passed to ``validate_codes``.

    Returns
    -------
    pandas.DataFrame or None
        Mask with the index and columns of ``data``. Columns whose
        ``column_type`` is numeric, 'key' or 'datetime' hold the validation
        result; the remaining columns are left unset. None is returned
        (after logging an error) when ``data`` or ``mask0`` is not a
        DataFrame.
    """
    logging.basicConfig(format='%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s',
                        level=logging.INFO, datefmt='%Y%m%d %H:%M:%S',
                        filename=None)

    # Check input
    if not isinstance(data, pd.DataFrame) or not isinstance(mask0, pd.DataFrame):
        logging.error('Input data and mask must be a pandas data frame object')
        return None

    # Get the data elements from the input data: might be just a subset of
    # data model and flatten the schema to get a simple and sequential list
    # of elements included in the input data
    elements = list(data)
    element_atts = schemas.df_schema(elements, schema)

    # See what elements we need to validate
    numeric_elements = []
    datetime_elements = []
    coded_elements = []
    for element in elements:
        column_type = element_atts.get(element).get('column_type')
        if column_type in properties.numeric_types:
            numeric_elements.append(element)
        if column_type == 'datetime':
            datetime_elements.append(element)
        if column_type == 'key':
            coded_elements.append(element)

    mask = pd.DataFrame(index=data.index, columns=data.columns)

    # Validate elements by dtype. Every step is skipped when it has no
    # elements, so no empty column selection is ever assigned.
    # 1. Numeric elements
    if numeric_elements:
        mask[numeric_elements] = validate_numeric(numeric_elements, data, element_atts)

    # 2. Table coded elements
    # DEV notes: in code tables with multiple keys, the non-parameter
    # element won't have a code table attribute in element_atts, so the
    # code table keys need to be checked in addition to element_atts.
    # Additionally, a YEAR key can fail in one table but be compliant with
    # another: how would we mask this? Also, a YEAR defined as an integer
    # will undergo its own check...
    # So nested keys should be checked as a whole, masking only the actual
    # parameterized element: get the full list of key combinations
    # (tuples, triplets...) and check the column combination against it;
    # if it fails, mark the element.
    # Still need to see how to grab the YEAR part when YEAR comes from a
    # datetime element: pd.DatetimeIndex(df['_datetime']).year
    if coded_elements:
        mask[coded_elements] = validate_codes(coded_elements, data, code_tables_path, element_atts)

    # 3. Datetime elements
    # Those declared as such in element_atts. Because of the way they are
    # converted when read into datetime, they should already be NaT if
    # they do not validate as a valid datetime.
    if datetime_elements:
        mask[datetime_elements] = data[datetime_elements].notna()

    # Elements already False in the input mask stay False. dict.fromkeys
    # removes duplicates while keeping a deterministic order.
    validated = list(dict.fromkeys(numeric_elements + coded_elements + datetime_elements))
    if validated:
        if any(isinstance(x, tuple) for x in validated):
            validated_columns = pd.MultiIndex.from_tuples(validated)
        else:
            validated_columns = validated
        # '== False' is intentional: missing entries in mask0 compare as
        # not-False and therefore do not override the validation result.
        mask[validated_columns] = mask[validated_columns].mask(mask0[validated_columns] == False, False)  # noqa: E712

    return mask