Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import sys 

2import os 

3sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../") 

4from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe 

5from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes 

6import pytest 

7import pandas as pd 

8from enum import Enum 

9 

def test_drop_columns():
    """Check that ``Dataframe.drop_columns`` removes the requested columns.

    Covers three cases: dropping a single column via the ``column_list_``
    keyword, dropping several columns positionally, and raising when the
    argument is not a list.
    """
    test_schema = """
    {
        "namespace": "Dataframe",
        "type": "object",
        "name": "I am a test",
        "fields": [
            {"name":"Id1","type":{"type":"string"}},
            {"name":"Id2","type":{"type":"int"}},
            {"name":"Id3","type":{"type":"boolean"}},
            {"name":"Id4","type":{"type":"string"}},
            {"name":"Id5","type":{"type":"float"}},
            {"name":"Id6","type":{"type":"string"}},
            {"name":"Id7","type":{"type":"string", "logical_type":"datetime"}}
        ]
    }
    """

    df = Dataframe(SDCDFTypes.PANDAS, test_schema)

    df.load_data([{
        "Id1": "string",
        "Id2": 5,
        "Id3": True,
        "Id4": "string",
        "Id5": 5.5,
        "Id6": "string",
        "Id7": "2019-06-13T15:06:18.337Z",
    }])

    assert len(df.df.columns) == 7

    # Test dropping 1 column
    df.drop_columns(column_list_=["ID1"])
    assert "ID1" not in df.df.columns.tolist()
    # The "not in" check alone would also pass if no column was ever named
    # "ID1"; the count proves exactly one column was actually removed.
    assert len(df.df.columns) == 6

    # Test dropping multiple columns at once
    df.drop_columns(["ID3", "ID7"])
    assert "ID3" not in df.df.columns.tolist()
    assert "ID7" not in df.df.columns.tolist()
    assert len(df.df.columns) == 4

    # Test function fails when argument passed is not a list.
    # "ID1" was already dropped above, so passing it could raise because the
    # column is missing rather than because the argument is a string. Use
    # "ID2" instead (presumably still present - it has not been dropped in
    # this test) so the failure is attributable to the argument type.
    with pytest.raises(Exception):
        df.drop_columns("ID2")

54 

def test_drop_columns_from_schema():
    """Check that a field flagged ``"drop_column": true`` is absent after load.

    The schema declares seven fields and marks ``Id1`` with ``drop_column``.
    No explicit ``drop_columns`` call is made; the test expects six columns
    and no ``ID1`` column once the data has been loaded.
    """
    schema_json = """
    {
        "namespace": "Dataframe",
        "type": "object",
        "name": "I am a test",
        "fields": [
            {"name":"Id1","type":{"type":"string"},"drop_column":true},
            {"name":"Id2","type":{"type":"int"}},
            {"name":"Id3","type":{"type":"boolean"}},
            {"name":"Id4","type":{"type":"string"}},
            {"name":"Id5","type":{"type":"float"}},
            {"name":"Id6","type":{"type":"string"}},
            {"name":"Id7","type":{"type":"string", "logical_type":"datetime"}}
        ]
    }
    """

    # One record that supplies a value for every declared field,
    # including the one the schema asks to drop.
    record = {
        "Id1": "string",
        "Id2": 5,
        "Id3": True,
        "Id4": "string",
        "Id5": 5.5,
        "Id6": "string",
        "Id7": "2019-06-13T15:06:18.337Z",
    }

    frame = Dataframe(SDCDFTypes.PANDAS, schema_json)
    frame.load_data([record])

    # Seven fields declared, one flagged for dropping.
    assert len(frame.df.columns) == 6

    remaining_columns = frame.df.columns.tolist()
    assert "ID1" not in remaining_columns

91 

92 

def test_fill_in_column():
    """Check that ``Dataframe.fill_in_column`` sets a constant column value.

    Covers filling an all-null column, overwriting a previously filled
    column, raising when the column does not exist and ``create_column_`` is
    False, and creating the column when ``create_column_`` is True.
    """
    test_schema = """
    {
        "namespace": "Dataframe",
        "type": "object",
        "name": "I am a test",
        "fields": [
            {"name":"Id1","type":{"type":"string"}},
            {"name":"Id2","type":{"type":"string", "add_column": true }}
        ]
    }
    """

    df = Dataframe(SDCDFTypes.PANDAS, test_schema)

    df.load_data([
        {"Id1": "string", "Id2": None},
        {"Id1": "string", "Id2": None},
        {"Id1": "string", "Id2": None},
        {"Id1": "string", "Id2": None},
        {"Id1": "string", "Id2": None}])

    assert df.df["ID2"].isnull().all()

    df.fill_in_column(column_name_="ID2", column_value_="Cat", create_column_=False)

    # Compare element-wise first, then reduce. ``Series.all()`` on its own is
    # a truthiness reduction, so comparing its result to "Cat" never verified
    # that every row holds the expected value.
    assert (df.df["ID2"] == "Cat").all()

    df.fill_in_column(column_name_="ID2", column_value_="Dog", create_column_=False)

    assert (df.df["ID2"] == "Dog").all()

    # "ID4" is not in the schema or the data, so filling it without
    # permission to create the column is expected to raise.
    with pytest.raises(Exception):
        df.fill_in_column(column_name_="ID4", column_value_="Dog",
                          create_column_=False)

    df.fill_in_column(column_name_="ID4", column_value_="Dog", create_column_=True)

    assert (df.df["ID4"] == "Dog").all()

133 

134 

def test_concat_columns_transformation():
    """Check the ``concat_columns`` transformation for PRE and POST stages.

    The schema attaches a PRE ``concat_columns`` transformation to ``ID3`` and
    a POST one to ``ID4``, both joining ``ID1`` and ``ID2`` with a single
    space. The test runs the PRE stage, checks ``ID3``, then runs the POST
    stage and checks that ``ID4`` has been appended.
    """
    class TransformationType(Enum):
        pre = "PRE"
        post = "POST"

    test_schema = """
    {
        "namespace": "Dataframe",
        "type": "object",
        "name": "I am a test",
        "fields": [
            {"name":"ID1","type":{"type":"string"}},
            {"name":"ID2","type":{"type":"string", "add_column": true }},
            {"name":"ID3",
             "type":{"type":"string"},
             "is_pii": true,
             "transformations": [{"transformation_type":"PRE", "type":"concat_columns", "opts": {"list_columns": ["ID1", "ID2"], "separator":" "}}]
            },
            {"name":"ID4",
             "type":{"type":"string"},
             "is_pii": true,
             "transformations": [{"transformation_type":"POST", "type":"concat_columns", "opts": {"list_columns": ["ID1", "ID2"], "separator":" "}}]
            }
        ]
    }
    """

    sdc_frame = Dataframe(SDCDFTypes.PANDAS, test_schema)
    # Bypass load_data and install the two source columns directly.
    source_rows = [["value1", "value4"], ["value2", "value5"]]
    sdc_frame.df = pd.DataFrame(source_rows, columns=['ID1', 'ID2'])

    # --- PRE stage: only ID3 should be produced ---
    sdc_frame.perform_column_transformations(TransformationType.pre)

    assert len(sdc_frame.df) == 2
    assert (list(sdc_frame.df.columns.values)) == ['ID1', 'ID2', 'ID3']

    first_row = sdc_frame.df.loc[sdc_frame.df['ID1'] == "value1"]
    assert first_row["ID3"].item() == "value1 value4"

    second_row = sdc_frame.df.loc[sdc_frame.df['ID1'] == "value2"]
    assert second_row["ID3"].item() == "value2 value5"

    # --- POST stage: ID4 is added, row count unchanged ---
    sdc_frame.perform_column_transformations(TransformationType.post)
    assert len(sdc_frame.df) == 2
    assert (list(sdc_frame.df.columns.values)) == ['ID1', 'ID2', 'ID3', 'ID4']