Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1 

2import gzip 

3import io 

4import json 

5import os 

6import types 

7import pytest 

8from sdc_etl_libs.sdc_file_helpers.SDCFileFactory import SDCFileFactory 

9from sdc_etl_libs.sdc_data_schema.schema_toolbox import SchemaToolbox 

10from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe 

11 

# Directory holding this test module and its JSON schema fixtures.
_TEST_DIR = os.path.dirname(os.path.abspath(__file__))


def _load_test_schema(filename_):
    """
    Parse a JSON schema fixture that lives next to this test module.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of being left for the garbage collector.

    :param filename_: Fixture file name, beginning with "/" (appended to the test directory as-is).
    :return: The parsed JSON document.
    """

    with open(_TEST_DIR + filename_) as schema_file:
        return json.load(schema_file)


# Each schema is parsed from disk separately, even when two entries share a
# file, so every test_schema_N is its own object.
# NOTE(review): presumably this keeps the endpoints independent if SchemaToolbox
# mutates its input - confirm before sharing a single parsed copy.
test_schema_1 = _load_test_schema("/test_schema_csv_gzip_file_1.json")
ep_schema_1 = SchemaToolbox.get_endpoint_data_from_schema(test_schema_1, "main_source", validate_=True)

test_schema_2 = _load_test_schema("/test_schema_csv_file_1.json")
ep_schema_2 = SchemaToolbox.get_endpoint_data_from_schema(test_schema_2, "main_source", validate_=True)

test_schema_3 = _load_test_schema("/test_schema_csv_file_1.json")
ep_schema_3 = SchemaToolbox.get_endpoint_data_from_schema(test_schema_3, "main_source_no_headers", validate_=True)

test_schema_4 = _load_test_schema("/test_schema_csv_gzip_file_1.json")
ep_schema_4 = SchemaToolbox.get_endpoint_data_from_schema(test_schema_4, "main_source_no_headers", validate_=True)

test_schema_5 = _load_test_schema("/test_schema_csv_file_5.json")
ep_schema_5 = SchemaToolbox.get_endpoint_data_from_schema(test_schema_5, "main_source", validate_=True)

26 

# In-memory fixture payloads used by the tests below.
#
# The data rows are written once and the "with headers" variants are built by
# prepending the header line, so the two variants cannot drift apart.
# NOTE(review): field separators in the gzip rows are reproduced exactly as they
# appear in this view - confirm they match the delimiter the schema expects.

# Six CSV records, newline-separated, no trailing newline.
csv_in_no_headers = (
    '2020-02-24,CompanyABC,uuid2039,flightname2039,0.00,100,1000\n'
    '2020-02-25,CompanyABC,uuid2039,flightname2039,1.00,100,2000\n'
    '2020-02-26,CompanyABC,uuid2039,flightname2039,2.00,100,3000\n'
    '2020-02-27,CompanyABC,uuid2039,flightname2039,3.00,100,4000\n'
    '2020-02-28,CompanyABC,uuid2039,flightname2039,4.00,100,5000\n'
    '2020-02-29,CompanyABC,uuid2039,flightname2039,5.00,100,6000'
)

# Same six records preceded by the column header line.
csv_in_headers = (
    'DATE,ADVERTISER_NAME,FLIGHT_UID,FLIGHT_NAME,SPEND,IMPRESSIONS,CLICKS\n'
    + csv_in_no_headers
)

# Five records for the gzip tests, each terminated by a newline.
gzip_in_no_headers = (
    '2020-02-24 2020-01-10 LEAD Online 103 Direct google Untapped Millennials IM c57cf3400d98350e15fbaa969ef51b40 cb4e4cf45dcd92b594d0b0576d4ab10a 730d5623996c404f496e425178467167 \\N DSDK_PINTEREST_IMPRESSIONS 103 NA NA NA NA NA NA NA NA Untapped Millennials Social Pinterest oCPM Video | 30-30-1 | Sales | CRM Onboarding Leads 120 Day Old + | Mobile BNS-NCA Split | Kiara | 3D Smile | 8sec | 9x16 Video | Straighten while you sleep | 60% less Desc | QUIZ CTA | LP-SA | 10.25.19 oCPM Video | 30-30-1 | Sales | CRM Lead Retargeting | 8.30.19 Smile Direct Club Pinterest 2680064269157 626740993226 4101381764522768493 687202972135 NA NA NA NA United States Smartphone 0.020010357062112277 0.014099872824662636\n'
    '2020-02-24 2020-01-10 LEAD Online 129 Direct sfmc Untapped Millennials IM c57cf3400d98350e15fbaa969ef51b40 cb4e4cf45dcd92b594d0b0576d4ab10a bcb23abbb0de703f4464be090943a605 \\N DSDK_PINTEREST_IMPRESSIONS 170 NA NA NA NA NA NA NA NA Untapped Millennials Social Pinterest oCPM Video | 30-30-1 | Sales | CRM Onboarding Leads 120 Day Old + | Mobile Influencer UGC | Olivia Vargus | 15 sec | 9x16 Video | 60% less | QUIZ CTA | LP-SA | 10.14.19 oCPM Video | 30-30-1 | Sales | CRM Lead Retargeting | 8.30.19 Smile Direct Club Pinterest 2680064269157 626740993226 4101381764522768493 687202841087 NA NA NA NA United States Smartphone 0.008716941487587103 0.004435280393289234\n'
    '2020-02-24 2020-01-10 LEAD Online 132 Insurance google Image Conscious Singles IM 5d8f231adcf22f6892bbc32fc87f8a13 268e0706dd469c472be1f1335f972546 17e8bc94b09dcc50fc3dcd7c80a23d4a \\N DSDK_PINTEREST_IMPRESSIONS 123 NA NA NA NA NA NA NA NA FreeSpirited Families Social Pinterest oCPM | 7-7-7 | Leads | All-Time | Aligner AAL 10% | Mobile Polaroid+TIAA | UGC B&A FF | Ashleigh_Huber | 6 months | 1x2 | GS CTA | LP-SA | 7.10.19 oCPM | 7-7-7 | Leads | AAL | National | Prospecting | 9.12.19 Smile Direct Club Pinterest 2680064647656 626741028935 4101381764522768493 687203237545 NA NA NA NA United States Smartphone 0.009099412899169445 0.009029375325115193\n'
    '2020-02-24 2020-01-10 LEAD Online 141 Direct facebook FreeSpirited Families IM 1ef3a04f995e11853e6afb7c9f41bad4 714311bc1f88bdeb6a3b1ac0104a6e0f a5ad4372558b631e54257046f78a85d4 \\N DSDK_PINTEREST_IMPRESSIONS 127 NA NA NA NA NA NA NA NA FreeSpirited Families Social Pinterest oCPM | 7-0-0 | Leads | All Site Visitor Retargeting | Last 180 Days | Mobile Influencer UGC | NoisyButters | 15 sec | 9x16 Video | 60% less | QUIZ CTA | LP-SA | 12.3.19 oCPM | 7-0-0 | Leads | All Site Visitor Retargeting | National | 10.1.19 Smile Direct Club Pinterest 2680064408332 626741085273 4101381764522768493 687203459415 NA NA NA NA United States Smartphone 0.2727272727272727 0.07209040799742467\n'
    '2020-02-24 2020-01-10 LEAD Online 35 Direct facebook Influential Families IM 40191ed7cc8218198e1a5e3fcd1c6b1a 7260d0c1124fc54cc57746d0774ddb14 85c997f9903c31272c137b821ef06e8a \\N DSDK_PINTEREST_IMPRESSIONS 110 NA NA NA NA NA NA NA NA Content with Life Social Pinterest oCPM | 30-30-1 | Sales | 1T | Neustar | Young Pioneers | Mobile UGC Aligner in Mouth | Alyssa BF A | Clear aligners | 9x16 | GS CTA | LP-SA | 7.10.19 oCPM-Beta | 30-30-1 | Sales | 1T | Neustar | National | Prospecting | 1.4.19 Smile Direct Club Pinterest 2680063304980 626740282356 4101381764522768493 687201801268 NA NA NA NA United States Smartphone 0.3333333333333333 0.08899940734971507\n'
)

# Same five records preceded by the column header line.
gzip_in_headers = (
    'act_date evt_date activity_type act_channel act_e1_segment act_custom1 act_custom6 act_user_segment evt_type_class evt_campaign_key evt_placement_key evt_creative_key evt_keyword evt_source evt_e1_segment ad_buy_placement ad_format buy_method ad_buy_channel tactic targeting_type targeting_category targeting_segment evt_user_segment media_channel evt_publisher_name evt_placement_name evt_creative_name evt_campaign_name evt_advertiser_name evt_placement_id evt_campaign_id evt_publisher_id evt_creative_id geo device age gender evt_geo_name evt_device_type_name full_ia_count ia_count\n'
    + gzip_in_no_headers
)

54 

55 

def test_entire_csv_load_into_dataframe():
    """
    Load a whole csv file obj (header row included) into an SDCDataframe in one call.
    """

    source = io.StringIO(csv_in_headers)
    csv_file = SDCFileFactory.get_file(
        test_schema_2, ep_schema_2, "neustar-file-name.csv", "fact-funnel-path/", source)
    loaded = csv_file.get_file_as_dataframe()

    # All six records must have made it into the SDCDataframe
    assert loaded is not None
    assert isinstance(loaded, Dataframe)
    assert loaded.df is not None
    assert len(loaded.df) == 6

70 

71 

def test_chunked_csv_load_into_dataframe():
    """
    Iterate over a csv file obj in chunks of three records.
    """

    source = io.StringIO(csv_in_headers)
    csv_file = SDCFileFactory.get_file(
        test_schema_2, ep_schema_2, "neustar-file-name.csv", "fact-funnel-path/", source)
    chunks = csv_file.get_file_as_dataframe(chunksize_=3)
    assert isinstance(chunks, types.GeneratorType)

    # Six records at a chunk size of three: two yields of three records each
    for expected_rows in (3, 3):
        chunk = next(chunks)
        assert isinstance(chunk, Dataframe)
        assert chunk.df is not None
        assert len(chunk.df) == expected_rows

    # Nothing is left to yield, so the generator must raise StopIteration
    with pytest.raises(StopIteration):
        next(chunks)

97 

98 

def test_entire_gzip_load_into_dataframe():
    """
    Load a whole gzipped csv file obj (header row included) into an SDCDataframe in one call.
    """

    compressed = gzip.compress(gzip_in_headers.encode('utf-8'))
    source = io.BytesIO(compressed)
    gzip_file = SDCFileFactory.get_file(
        test_schema_1, ep_schema_1, "neustar-file-name.csv", "fact-funnel-path/", source)
    loaded = gzip_file.get_file_as_dataframe()

    # An SDCDataframe holding the 5 records must come back
    assert isinstance(loaded, Dataframe)
    assert loaded.df is not None
    assert len(loaded.df) == 5

113 

114 

def test_chunked_gzip_load_into_dataframe():
    """
    Iterate over a gzipped csv file obj in chunks of two records.
    """

    compressed = gzip.compress(gzip_in_headers.encode('utf-8'))
    source = io.BytesIO(compressed)
    gzip_file = SDCFileFactory.get_file(
        test_schema_1, ep_schema_1, "neustar-file-name.csv", "fact-funnel-path/", source)
    chunks = gzip_file.get_file_as_dataframe(chunksize_=2)
    assert isinstance(chunks, types.GeneratorType)

    # Five records at a chunk size of two: two full chunks, then the last remaining record
    for expected_rows in (2, 2, 1):
        chunk = next(chunks)
        assert isinstance(chunk, Dataframe)
        assert chunk.df is not None
        assert len(chunk.df) == expected_rows

    # Nothing is left to yield, so the generator must raise StopIteration
    with pytest.raises(StopIteration):
        next(chunks)

147 

def test_get_file_size_for_csv_file():
    """
    get_file_size() must report 6 for a csv file obj holding a header line plus six records.
    """

    source = io.StringIO(csv_in_headers)
    csv_file = SDCFileFactory.get_file(
        test_schema_2, ep_schema_2, "neustar-file-name.csv", "fact-funnel-path/", source)

    record_count = csv_file.get_file_size()
    assert record_count == 6

157 

def test_get_file_size_for_csv_file_with_no_headers():
    """
    get_file_size() must still report 6 when the csv source data carries no header line.
    """

    source = io.StringIO(csv_in_no_headers)
    csv_file = SDCFileFactory.get_file(
        test_schema_3, ep_schema_3, "neustar-file-name.csv", "fact-funnel-path/", source)

    record_count = csv_file.get_file_size()
    assert record_count == 6

167 

def test_get_file_size_for_gzip_file():
    """
    get_file_size() must report 5 for a gzipped csv file obj holding a header line plus five records.
    """

    compressed = gzip.compress(gzip_in_headers.encode('utf-8'))
    source = io.BytesIO(compressed)
    gzip_file = SDCFileFactory.get_file(
        test_schema_1, ep_schema_1, "neustar-file-name.csv", "fact-funnel-path/", source)

    record_count = gzip_file.get_file_size()
    assert record_count == 5

179 

def test_get_file_size_for_gzip_file_with_no_headers():
    """
    get_file_size() must still report 5 when the gzipped source data carries no header line.
    """

    compressed = gzip.compress(gzip_in_no_headers.encode('utf-8'))
    source = io.BytesIO(compressed)
    gzip_file = SDCFileFactory.get_file(
        test_schema_4, ep_schema_4, "neustar-file-name.csv", "fact-funnel-path/", source)

    record_count = gzip_file.get_file_size()
    assert record_count == 5

191 

def test_entire_csv_load_into_dataframe_with_a_function_preprocessing():
    """
    Load a whole csv file obj into an SDCDataframe using a schema
    that applies a preprocessing function, and check the resulting columns.
    """

    source = io.StringIO(csv_in_headers)
    csv_file = SDCFileFactory.get_file(
        test_schema_5, ep_schema_5, "neustar-file-name.csv", "fact-funnel-path/", source)
    loaded = csv_file.get_file_as_dataframe()

    # All six records must have made it into the SDCDataframe
    assert loaded is not None
    assert isinstance(loaded, Dataframe)
    assert loaded.df is not None
    assert len(loaded.df) == 6

    # The new FLIGHT column must sit after the source columns and before the ETL columns
    expected_columns = [
        'DATE', 'ADVERTISER_NAME', 'FLIGHT_UID', 'FLIGHT_NAME', 'SPEND',
        'IMPRESSIONS', 'CLICKS', 'FLIGHT', '_ETL_FILENAME', '_SF_INSERTEDDATETIME',
    ]
    assert list(loaded.df.columns.values) == expected_columns

    # Spot-check the FLIGHT value on the single row dated 2020-02-24
    first_day_rows = loaded.df.loc[loaded.df['DATE'] == "2020-02-24"]
    assert first_day_rows["FLIGHT"].item() == "uuid2039 flightname2039"