Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""Helper functions for working with an SDCDataframe""" 

2# pylint: disable=E0401 

3from sdc_etl_libs.sdc_data_schema.schema_toolbox import SchemaToolbox 

4from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe 

5from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes 

6import pandas as pd 

7import logging 

8 

9 

class SDCDataframeHelpers:
    """
    SDC Dataframe Helpers

    Stateless utility functions (exposed as static methods) for inspecting
    and combining SDC Dataframe objects.
    """

    @staticmethod
    def check_for_null_values(sdc_dataframe_, columns_=None):
        """
        Check for null values in an SDCDataframe.
        :param sdc_dataframe_: An SDCDataframe.
        :param columns_: List. Columns to check for null values in. If None
            (or empty), then all columns are checked. A single column name
            given as a string is treated as a one-element list.
            Default value = None.
        :return: List of Tuples, where:
            first value = column name
            second value = count of null values found in column
            Only columns with at least one null value are listed. Returns
            None when no null values are found.
        :raises Exception: If the dataframe type is not SDCDFTypes.PANDAS.
        """

        if sdc_dataframe_.type == SDCDFTypes.PANDAS:
            if isinstance(columns_, str):
                columns_ = [columns_]

            frame = sdc_dataframe_.df[columns_] if columns_ else sdc_dataframe_.df

            # Take the labels from the null-count Series itself so each count
            # is paired with the column it was computed for. Zipping against
            # the full column list mislabels results when only a subset of
            # columns is checked.
            null_counts = frame.isna().sum()
            filtered_results = [(column, count) for column, count in null_counts.items() if count > 0]

            return filtered_results or None

        raise Exception(f"Dataframe type '{sdc_dataframe_.type}' has not been tested with this function.")

    @staticmethod
    def get_datetime_columns_from_schema(schema_, data_):
        """
        Given a schema and data, creates a dataframe and
        checks for datetime type columns.
        :param schema_: (dict): schema object
        :param data_: data to load the DataFrame
        :return: (list): of strings (names of datetime fields in schema Ex. ['ModifiedDate','sms_sent_dt'])
        :raises Exception: If the schema's dataframe type is not SDCDFTypes.PANDAS,
            or if building/inspecting the dataframe fails (original error is chained).
        """
        df_type = SchemaToolbox.determine_dataframe_type_from_schema(schema_)
        if df_type == SDCDFTypes.PANDAS:
            try:
                sdc_df_ = Dataframe(SDCDFTypes.PANDAS, schema_)
                sdc_df_.load_data(data_)
                datetime_frame = sdc_df_.df.select_dtypes(include=['datetime64', 'datetime'])
                fields = SchemaToolbox.get_field_names_for_file(schema_)
                # Field names are matched in upper case against the dataframe
                # columns; presumably the loader upper-cases column names --
                # TODO confirm against Dataframe.load_data.
                cols = [field for field in fields if field.upper() in datetime_frame.columns]
            except Exception as e:
                raise Exception(f"get_datetime_columns_from_schema. Exception: {e}") from e
            return cols
        raise Exception(f"Dataframe type '{df_type}' is not contemplated in this function.")

    @staticmethod
    def concat_sdc_dataframe(schema_, sdc_dataframe_, new_sdc_dataframe_):
        """
        Given a schema and a couple of SDC Dataframes, return a SDC Dataframe with
        the concatenation of the initial couple of SDC Dataframes.

        When the schema declares merge key fields, both input dataframes get
        their index set to those fields IN PLACE (the key columns are kept),
        and rows of sdc_dataframe_ whose key also appears in new_sdc_dataframe_
        are dropped so that the rows from new_sdc_dataframe_ take precedence.
        Without merge key fields the two dataframes are simply stacked.

        :param schema_: (dict): schema object
        :param sdc_dataframe_: SDC Dataframe
        :param new_sdc_dataframe_: SDC Dataframe
        :return: (SDC Dataframe): the concatenation of a couple of SDC Dataframes.
            If sdc_dataframe_.df is None, new_sdc_dataframe_ is returned as is.
        :raises Exception: If the schema's dataframe type is not SDCDFTypes.PANDAS,
            or if the concatenation fails (original error is logged and chained).
        """
        key_fields = SDCDataframeHelpers.get_merge_key_fields(schema_)
        df_type = SchemaToolbox.determine_dataframe_type_from_schema(schema_)
        if sdc_dataframe_.df is None:
            return new_sdc_dataframe_

        if df_type == SDCDFTypes.PANDAS:
            try:
                if key_fields:
                    sdc_dataframe_.df.set_index(keys=key_fields, inplace=True, drop=False)
                    new_sdc_dataframe_.df.set_index(keys=key_fields, inplace=True, drop=False)
                    # Keep only the existing rows whose key is absent from the
                    # new data, then append all the new rows.
                    superseded = sdc_dataframe_.df.index.isin(new_sdc_dataframe_.df.index)
                    df = pd.concat([sdc_dataframe_.df[~superseded], new_sdc_dataframe_.df], sort=False)
                else:
                    df = pd.concat([sdc_dataframe_.df, new_sdc_dataframe_.df])

                sdc_df_result = Dataframe(SDCDFTypes.PANDAS, schema_)
                sdc_df_result.df = df
                sdc_df_result.shape = sdc_df_result.df.shape
                return sdc_df_result
            except Exception as e:
                logging.exception(e)
                raise Exception("Error concatenating Dataframes") from e
        else:
            raise Exception(f"Dataframe type '{sdc_dataframe_.type}' has not been tested with this function.")

    @staticmethod
    def get_merge_key_fields(schema_):
        """
        Given a schema return a list with merge key fields
        :param schema_: (dict): schema object with a "fields" list of field dicts
        :return: (list): of strings, one per field whose "sf_merge_key" is exactly
            True, in schema order. The field's "rename" value is used when present,
            otherwise its "name" (Ex. ['id', 'modified_date'])
        """
        return [
            field["rename"] if "rename" in field else field["name"]
            for field in schema_["fields"]
            if "sf_merge_key" in field and field["sf_merge_key"] is True
        ]