Coverage for libs/sdc_etl_libs/sdc_dataframe/sdc_dataframe_helpers.py : 53%

1"""Helper functions for working with an SDCDataframe"""
2# pylint: disable=E0401
3from sdc_etl_libs.sdc_data_schema.schema_toolbox import SchemaToolbox
4from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe
5from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes
6import pandas as pd
7import logging


class SDCDataframeHelpers:
    """
    SDC Dataframe Helpers
    """

    @staticmethod
    def check_for_null_values(sdc_dataframe_, columns_=None):
        """
        Check for null values in an SDCDataframe.
        :param sdc_dataframe_: An SDCDataframe.
        :param columns_: List. Columns to check for null values in. If None, all columns are checked.
            Default value = None.
        :return: List of Tuples, where:
            first value = column name
            second value = count of null values found in the column
            Returns None if no null values are found.
        """

        if sdc_dataframe_.type == SDCDFTypes.PANDAS:
            if columns_:
                # Zip the requested columns with their own null counts so names and counts stay aligned.
                zipped_results = zip(columns_, sdc_dataframe_.df[columns_].isna().sum())
            else:
                zipped_results = zip(sdc_dataframe_.df.columns, sdc_dataframe_.df.isna().sum())

            filtered_results = list(filter(lambda x: x[1] > 0, zipped_results))

        else:
            raise Exception(f"Dataframe type '{sdc_dataframe_.type}' has not been tested with this function.")

        return filtered_results or None
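
    # Usage sketch (hypothetical; assumes `sdc_df` is a pandas-backed SDCDataframe that has
    # already been loaded with data -- the column names below are placeholders):
    #
    #   nulls = SDCDataframeHelpers.check_for_null_values(sdc_df, columns_=["email", "phone"])
    #   if nulls:
    #       for column_name, null_count in nulls:
    #           logging.warning("Column %s contains %s null values", column_name, null_count)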

    @staticmethod
    def get_datetime_columns_from_schema(schema_, data_):
        """
        Given a schema and data, creates a dataframe and checks for datetime-type columns.
        :param schema_: (dict): schema object
        :param data_: data to load into the DataFrame
        :return: (list): of strings (names of datetime fields in the schema. Ex. ['ModifiedDate', 'sms_sent_dt'])
        """

        df_type = SchemaToolbox.determine_dataframe_type_from_schema(schema_)
        if df_type == SDCDFTypes.PANDAS:
            try:
                sdc_df_ = Dataframe(SDCDFTypes.PANDAS, schema_)
                sdc_df_.load_data(data_)
                df_ = sdc_df_.df
                # Columns that pandas loaded with a datetime dtype.
                datetime_columns = df_.select_dtypes(include=['datetime64', 'datetime']).columns
                fields = SchemaToolbox.get_field_names_for_file(schema_)
                cols = [field for field in fields if field.upper() in datetime_columns]
            except Exception as e:
                raise Exception(f"get_datetime_columns_from_schema. Exception: {e}")
            return cols
        raise Exception(f"Dataframe type '{df_type}' is not supported by this function.")
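
    # Usage sketch (hypothetical; assumes `schema` is a schema dict accepted by
    # SchemaToolbox/Dataframe and `records` is data loadable by SDCDataframe.load_data):
    #
    #   datetime_cols = SDCDataframeHelpers.get_datetime_columns_from_schema(schema, records)
    #   logging.info("Datetime columns found: %s", datetime_cols)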

    @staticmethod
    def concat_sdc_dataframe(schema_, sdc_dataframe_, new_sdc_dataframe_):
        """
        Given a schema and two SDC Dataframes, return an SDC Dataframe containing the
        concatenation of the two input SDC Dataframes. When the schema defines merge key
        fields, rows in the first dataframe whose keys also appear in the second dataframe
        are replaced by the rows from the second dataframe.
        :param schema_: (dict): schema object
        :param sdc_dataframe_: SDC Dataframe
        :param new_sdc_dataframe_: SDC Dataframe
        :return: (SDC Dataframe): the concatenation of the two SDC Dataframes.
        """

        key_fields = SDCDataframeHelpers.get_merge_key_fields(schema_)
        df_type = SchemaToolbox.determine_dataframe_type_from_schema(schema_)
        if sdc_dataframe_.df is None:
            return new_sdc_dataframe_

        if df_type == SDCDFTypes.PANDAS:
            try:
                if len(key_fields) > 0:
                    # Index both dataframes on the merge keys so overlapping rows can be detected.
                    sdc_dataframe_.df.set_index(keys=key_fields, inplace=True, drop=False)
                    new_sdc_dataframe_.df.set_index(keys=key_fields, inplace=True, drop=False)
                    # Keep only rows from the original dataframe whose keys are not present
                    # in the new dataframe, then append the new dataframe.
                    df = pd.concat(
                        [sdc_dataframe_.df[~sdc_dataframe_.df.index.isin(new_sdc_dataframe_.df.index)],
                         new_sdc_dataframe_.df], sort=False)
                else:
                    df = pd.concat([sdc_dataframe_.df, new_sdc_dataframe_.df])

                sdc_df_result = Dataframe(SDCDFTypes.PANDAS, schema_)
                sdc_df_result.df = df
                sdc_df_result.shape = sdc_df_result.df.shape
                return sdc_df_result
            except Exception as e:
                logging.exception(e)
                raise Exception("Error concatenating Dataframes")
        else:
            raise Exception(f"Dataframe type '{sdc_dataframe_.type}' has not been tested with this function.")
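
    # Usage sketch (hypothetical; assumes `schema`, `existing_sdc_df`, and `incoming_sdc_df`
    # are a schema dict and two SDCDataframes built from that schema):
    #
    #   merged = SDCDataframeHelpers.concat_sdc_dataframe(schema, existing_sdc_df, incoming_sdc_df)
    #   logging.info("Merged dataframe shape: %s", merged.shape)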

    @staticmethod
    def get_merge_key_fields(schema_):
        """
        Given a schema, return a list of the merge key field names.
        :param schema_: (dict): schema object
        :return: (list): of strings (names of fields in the schema flagged with "sf_merge_key")
        """

        return [
            field["rename"] if "rename" in field else field["name"]
            for field in schema_["fields"]
            if field.get("sf_merge_key") is True
        ]
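
    # Usage sketch (hypothetical; the schema fields below are illustrative only):
    #
    #   schema = {"fields": [
    #       {"name": "id", "sf_merge_key": True},
    #       {"name": "old_name", "rename": "new_name", "sf_merge_key": True},
    #       {"name": "notes"}]}
    #   SDCDataframeHelpers.get_merge_key_fields(schema)  # -> ["id", "new_name"]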