Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import io 

2import json 

3import os 

4 

5import pytest 

6from sdc_etl_libs.sdc_data_exchange.SDCDataExchange import SDCDataExchange 

7from sdc_etl_libs.sdc_data_schema.schema_toolbox import SchemaToolbox 

8from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe 

9from sdc_etl_libs.sdc_file_helpers.SDCFileFactory import SDCFileFactory 

10 

11 

12def excel_file_in_s3_to_snowflake(): 

13 """ 

14 Kind of integration test. Here we move the content of an excel file in S3 to Snowflake 

15 Look at the json schema <_Tests/test_schema_excel_file> for more details 

16 """ 

17 exchange = SDCDataExchange("_Tests/test_schema_excel_file", "main_source", "SDC_sink_0") 

18 result = exchange.exchange_data() 

19 

20 

21def load_excel_file(source_tag: str): 

22 """ 

23 

24 @param source_tag: the tag of endpoint that is going to be used on the test 

25 @return: SDCExcelFile 

26 """ 

27 file = open(os.path.dirname(os.path.abspath(__file__)) + "/test_email_report.xlsx", "rb") 

28 file_object = io.BytesIO(file.read()) 

29 test_schema_1 = json.loads(open(os.path.dirname(os.path.abspath(__file__)) + "/test_schema_excel_file.json").read()) 

30 ep_schema_1 = SchemaToolbox.get_endpoint_data_from_schema(test_schema_1, source_tag, validate_=True) 

31 sdc_file = SDCFileFactory.get_file(test_schema_1, ep_schema_1, "'[0].Test Email Report.xlsx'", "excel/", 

32 file_object) 

33 return sdc_file 

34 

35 

36def test_parsing_excel_file_with_default_values(): 

37 """ 

38 Testing excel default read values = select all the rows from the first tab with header 

39 """ 

40 sdc_file = load_excel_file("main_source") 

41 df = sdc_file.get_file_as_dataframe() 

42 assert isinstance(df, Dataframe) 

43 assert df.df is not None 

44 assert df.df.shape == (14, 20) 

45 assert len(list(df.df.columns.values)) > 0 

46 

47 

48def test_parsing_excel_file_with_no_header(): 

49 """ 

50 Reading excel file without header. That means the test is going to throw an exception because the framework 

51 tries to parse the headers as they where part of the data 

52 """ 

53 sdc_file = load_excel_file("no_header") 

54 with pytest.raises(Exception) as e: 

55 assert sdc_file.get_file_as_dataframe() 

56 

57 

58def test_parsing_excel_file_with_no_header_skip_headers(): 

59 """ 

60 Reading excel file without header and skipping this row. This is going to fail even when the headers are skipped. 

61 That's because we need them in order to create the SDCDataFrame 

62 """ 

63 sdc_file = load_excel_file("no_header_and_skip") 

64 with pytest.raises(Exception) as e: 

65 assert sdc_file.get_file_as_dataframe() 

66 

67 

68def test_parsing_excel_file_skip_rows(): 

69 """ 

70 Reading excel file and skipping the first 10 rows out of 14 

71 """ 

72 sdc_file = load_excel_file("skip_rows") 

73 df = sdc_file.get_file_as_dataframe() 

74 assert df.df is not None 

75 assert df.df.shape == (4, 20) 

76 assert len(list(df.df.columns.values)) > 0 

77 

78 

79def test_parsing_excel_file_process_first_n_rows(): 

80 """ 

81 Reading excel file and parsing only the first 13 rows out of 14 

82 """ 

83 sdc_file = load_excel_file("first_n_rows") 

84 df = sdc_file.get_file_as_dataframe() 

85 assert df.df is not None 

86 assert df.df.shape == (13, 20) 

87 assert len(list(df.df.columns.values)) > 0 

88 

89 

90def test_parsing_excel_file_select_other_sheet(): 

91 """ 

92 Reading the second tab/sheet present on the excel file 

93 """ 

94 sdc_file = load_excel_file("select_other_sheet") 

95 df = sdc_file.get_file_as_dataframe() 

96 assert df.df is not None 

97 assert df.df.shape == (3, 20) 

98 assert len(list(df.df.columns.values)) > 0 

99 

100 

101def test_parsing_excel_file_multiple_sheets(): 

102 """ 

103 Reading the first two tabs/sheets 

104 """ 

105 sdc_file = load_excel_file("select_multiple_sheets") 

106 df = sdc_file.get_file_as_dataframe() 

107 assert df.df is not None 

108 assert df.df.shape == (17, 20) 

109 assert len(list(df.df.columns.values)) > 0 

110 

111 

112def test_parsing_excel_file_multiple_sheets_by_name(): 

113 """ 

114 Reading multiple tabs/sheets by name 

115 """ 

116 sdc_file = load_excel_file("select_multiple_sheets") 

117 df = sdc_file.get_file_as_dataframe() 

118 assert df.df is not None 

119 assert df.df.shape == (17, 20) 

120 assert len(list(df.df.columns.values)) > 0 

121 

122 

123def test_parsing_excel_file_multiple_sheets_by_name_and_ids(): 

124 """ 

125 Reading multiple tabs/sheets by name and ids 

126 """ 

127 sdc_file = load_excel_file("select_multiple_sheets_names_and_ids") 

128 df = sdc_file.get_file_as_dataframe() 

129 assert df.df is not None 

130 assert df.df.shape == (18, 20) 

131 assert len(list(df.df.columns.values)) > 0 

132 

133 

134def test_parsing_excel_file_all_sheets(): 

135 """ 

136 Reading all the tabs/sheets of the Excel file 

137 """ 

138 sdc_file = load_excel_file("all_sheets") 

139 df = sdc_file.get_file_as_dataframe() 

140 assert df.df is not None 

141 assert df.df.shape == (29, 20) 

142 assert len(list(df.df.columns.values)) > 0