#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#"""
#Created on Tue Apr 30 09:38:17 2019
#
#Splits string reports in sections using a data model layout.
#
#Input and output are simple pandas dataframes, with the output dataframe
#column names being the section names
#
#To work with a pandas TextParser, loop through this module.
#
#Internally works assuming highest complexity in the input data model:
#multiple non sequential sections
#
#DEV NOTES:
#
#1) make sure we use Series when working with Series, DataFrames otherwise...
#like now:
# threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len])
#instead of:
# threads[thread_id]['data'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[0:section_len])
#
#On data import in import_data.py we use pd.read_fwf because it is of more
#general use; also, supporting chunking would make converting to Series a bit dirty...
#
#2) Can we extend (do we need to?) this to reading sequential sections with
# no sentinals? apparently (see td11) we are already able to do that:
# provided the section is in a sequential parsing_order group
#
#@author: iregon
#
#TODO: document the threads approach.
#
#"""
import pandas as pd
from copy import deepcopy
import logging
# ---------------------------------------------------------------------------
# FUNCTIONS TO PERFORM INITIAL SEPARATION OF SECTIONS: MAIN IS GET_SECTIONS()
# ---------------------------------------------------------------------------
def add_next_children():
    """Start a new generation of child threads for the current thread.

    Works entirely on module-level state: reads ``threads[thread_id]`` and
    publishes, as globals, what the child builders need:

    - ``children_parsing_order``: deep copy of the thread's parsing order,
      consumed while the children are created.
    - ``branch_parsing_order``: a second, independent deep copy used as the
      template for per-branch parsing orders.
    - ``children_group_type``: first key of the leading parsing-order group.
    - ``children_group_number``: the thread's ``'children_group_number'``.

    The thread's ``'children_no'`` / ``'children'`` bookkeeping is reset
    before control is handed to ``add_children``.
    """
    global children_parsing_order, branch_parsing_order
    global children_group_type, children_group_number
    parent = threads[thread_id]
    pending_order = parent['parsing_order']
    # Two separate copies: they are mutated independently further down
    children_parsing_order = deepcopy(pending_order)
    branch_parsing_order = deepcopy(pending_order)
    # The first key of the leading group dict is taken as the group type
    children_group_type = list(children_parsing_order[0])[0]
    children_group_number = parent['children_group_number']
    # Fresh bookkeeping for the children about to be created
    parent.update(children_no=0, children=[])
    add_children()
def add_higher_group_children():
    """Create children for the current thread from the next parsing group.

    Same set-up as ``add_next_children`` but skipping the leading group of
    the thread's parsing order. If no group is left after skipping it,
    nothing is created. Otherwise the globals ``branch_parsing_order``,
    ``children_group_type`` and ``children_group_number`` (the thread's
    ``'children_group_number'`` plus one) are refreshed and ``add_children``
    is called. The thread's ``'children'`` / ``'children_no'`` entries are
    not reset here.
    """
    global children_parsing_order, branch_parsing_order
    global children_group_type, children_group_number
    parent = threads[thread_id]
    children_parsing_order = deepcopy(parent['parsing_order'])
    children_parsing_order.pop(0)  # Move to next group of sections
    if not children_parsing_order:
        # No higher group available: nothing to add
        return
    branch_parsing_order = deepcopy(parent['parsing_order'])
    branch_parsing_order.pop(0)
    # The first key of the (new) leading group dict is taken as its type
    children_group_type = list(children_parsing_order[0])[0]
    children_group_number = parent['children_group_number'] + 1
    add_children()
def add_children():
    """Dispatch child creation on the global ``children_group_type``.

    Group type ``'s'`` is handled by ``add_static_children``; any other
    group type is handled by ``add_dynamic_children``.
    """
    build_children = (
        add_static_children
        if children_group_type == 's'
        else add_dynamic_children
    )
    build_children()
def add_static_children():
    """Create the single child thread of a static (``'s'``) group.

    Operates on module-level state (``threads``, ``thread_id``,
    ``children_parsing_order``, ``children_group_type``,
    ``children_group_number``):

    - registers one child, with id ``<group number>0<parent id>``, in the
      parent's ``'children'`` list and increments ``'children_no'``;
    - takes the next section of the leading parsing-order group for the
      child; when that group is exhausted it is dropped from
      ``children_parsing_order`` and the child's ``'children_group_number'``
      becomes the next group number, or ``None`` if no group is left;
    - hands the parent's whole ``'modulo'`` to the child as
      ``'parent_data'`` and leaves the parent with an empty ``'modulo'``.
    """
    parent = threads[thread_id]
    parent['children_no'] += 1
    children_thread_id = str(children_group_number) + str(0) + thread_id
    parent['children'].append(children_thread_id)

    # Now build children's thread
    group_sections = children_parsing_order[0][children_group_type]
    children_section = group_sections.pop(0)
    grandchildren_group_number = children_group_number
    if not group_sections:
        # Current group exhausted: the child's own children (if any) come
        # from the next group
        children_parsing_order.pop(0)
        if children_parsing_order:
            grandchildren_group_number += 1
        else:
            grandchildren_group_number = None

    threads[children_thread_id] = {
        'parsing_order': children_parsing_order,
        'group_number': children_group_number,
        'group_type': children_group_type,
        'section': children_section,
        'parent_data': parent['modulo'],
        'children_group_number': grandchildren_group_number,
    }
    # Remove reports from modulo. ``.iloc[0:0]`` returns a new, empty frame
    # and does not modify anything in place, so its result must be stored.
    # Rebinding (rather than mutating) keeps the frame just handed to the
    # child as 'parent_data' intact.
    parent['modulo'] = parent['modulo'].iloc[0:0]
def add_dynamic_children():
    """Create one child thread per section of a non-static group.

    Operates on module-level state (``threads``, ``thread_id``,
    ``children_parsing_order``, ``branch_parsing_order``,
    ``children_group_type``, ``children_group_number``, ``sentinals``,
    ``sentinals_lens``).

    For every section of the leading parsing-order group, the reports in the
    parent's ``'modulo'`` whose leading characters equal the section sentinal
    are moved to a new child thread (id ``<group number><i+1><parent id>``)
    as its ``'parent_data'``. Sections with no matching report produce no
    child. Each child gets its own copy of the branch parsing order with its
    section removed; the whole group is dropped from that copy when it is
    exhausted or when the group type is ``'e'``. The child's
    ``'children_group_number'`` is then the next group number, or ``None``
    when nothing is left in its parsing order.

    If reports remain in the parent's ``'modulo'`` once all sections have
    been tried, ``add_higher_group_children`` is called.
    """
    parent = threads[thread_id]
    pending_sections = children_parsing_order[0][children_group_type]
    # The loop consumes pending_sections, so iterate over its initial length
    for i in range(len(pending_sections)):
        branch_i_parsing_order = deepcopy(branch_parsing_order)
        children_thread_id = str(children_group_number) + str(i + 1) + thread_id

        # Now build children's thread
        children_section = pending_sections.pop(0)
        modulo = parent['modulo']
        sentinal = sentinals.get(children_section)
        sentinal_len = sentinals_lens.get(children_section)
        # Reports starting with this section's sentinal
        children_idx = modulo.loc[modulo[0].str[0:sentinal_len] == sentinal].index
        if len(children_idx) == 0:
            continue

        parent['children'].append(children_thread_id)
        parent['children_no'] += 1
        branch_i_parsing_order[0][children_group_type].remove(children_section)
        grandchildren_group_number = children_group_number
        if len(branch_i_parsing_order[0][children_group_type]) == 0 or children_group_type == 'e':
            branch_i_parsing_order.pop(0)
            # Look at the branch's own remaining order (the list just
            # popped): children_parsing_order still holds the current group
            # here and is therefore never empty.
            if branch_i_parsing_order:
                grandchildren_group_number += 1
            else:
                grandchildren_group_number = None

        threads[children_thread_id] = {
            'parsing_order': branch_i_parsing_order,
            'group_number': children_group_number,
            'group_type': children_group_type,
            'section': children_section,
            'parent_data': modulo.loc[children_idx],
            'children_group_number': grandchildren_group_number,
        }
        # Reports handed to the child no longer belong to the parent
        modulo.drop(children_idx, inplace=True)

    if len(parent['modulo']) > 0:
        add_higher_group_children()
# ---------------------------------------------------------------------------
# MAIN
# ---------------------------------------------------------------------------
def main(string_df, schema, read_sections):
    """
    Returns a pandas dataframe with a report per row
    and the report sections split along the columns.

    Each section is a block string and only the sections
    listed in read_sections parameter are output.

    When the schema declares a single section no splitting is done:
    the input reports are returned with the column(s) named after
    read_sections. The input dataframe itself is left untouched
    (its column labels are not renamed).

    When the schema declares several sections, the module-level
    globals sentinals, section_lens, sentinals_lens and parsing_order
    are (re)set from the schema before the sections are extracted.

    Parameters
    ----------
    string_df : pandas.DataFrame
        Pandas dataframe with a unique column with
        the reports as a block string
    schema : dict
        Data source data model schema. Read here:
        schema['sections'][<section>]['header'] (optional 'length'
        and 'sentinal' entries) and schema['header']['parsing_order']
    read_sections : list
        Sections to output from the complete report

    Returns
    -------
    pandas.DataFrame
        Dataframe with the report sections split
        along the columns.
    """
    global sentinals, section_lens, sentinals_lens
    global parsing_order
    sections = schema['sections']

    # Single section: nothing to split, just return it in a named column.
    # Work on a shallow copy so that the caller's dataframe does not get
    # its columns renamed as a side effect (data is shared, not duplicated).
    if len(sections) <= 1:
        df_out = string_df.copy(deep=False)
        df_out.columns = read_sections
        return df_out

    # More than one section: proceed to split
    headers = {name: definition['header'] for name, definition in sections.items()}
    section_lens = {name: header.get('length') for name, header in headers.items()}
    sentinals = {name: header.get('sentinal') for name, header in headers.items()}
    # Sections without a sentinal get a sentinal length of 0
    sentinals_lens = {name: len(sentinal) if sentinal else 0
                      for name, sentinal in sentinals.items()}
    parsing_order = schema['header']['parsing_order']

    # Get sections separated: section dict has a key:value pair for each
    # section in the data model. If the section does not exist in the data,
    # the value is an empty pd.Series
    section_dict = extract_sections(string_df)

    # Paste in order (as read_sections) in a single dataframe with columns
    # named as sections:
    # - Drop unwanted sections
    # - Keep requested but non-existent sections
    df_out = pd.DataFrame()
    for section in read_sections:
        section_column = section_dict[section].rename(section)
        df_out = pd.concat([df_out, section_column], sort=False, axis=1)
    return df_out