Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import logging 

2import sys 

3import traceback 

4 

5import pandas as pd 

6 

7from sdc_etl_libs.sdc_data_schema.schema_toolbox import SchemaToolbox 

8from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe 

9from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes 

10from sdc_etl_libs.sdc_file_helpers.SDCFile import SDCFile 

11 

12 

13class SDCExcelFile(SDCFile): 

14 

15 def __init__(self, schema_, endpoint_schema_, file_name_, file_path_, file_obj_): 

16 

17 super(SDCExcelFile, self).__init__(schema_, endpoint_schema_, file_name_, file_path_, file_obj_) 

18 

19 if self.args is None or self.args == {}: 

20 raise Exception("Missing Excel Args") 

21 

22 self.__process_args() 

23 

24 def __process_args(self): 

25 self.args["header"] = 0 if self.args.get("header") else None 

26 self.args["sheet_identifiers"] = None if self.args.get("process_all_sheets") else self.args["sheet_identifiers"] 

27 if self.args["header"] == 0: 

28 self.args["skip_n_first_rows"] = list(range(1, self.args["skip_n_first_rows"] + 1)) 

29 elif self.args["header"] is None: 

30 self.args["skip_n_first_rows"] = list(range(0, self.args["skip_n_first_rows"])) 

31 

32 def get_file_as_dataframe(self): 

33 """ 

34 Loads an Excel file object into an SDCDataframe. The SDCExcelFile reads the following properties from 

35 the json schema: 

36 - headers: boolean - to include or not the headers (default True) 

37 - process_all_sheets: boolean - to process or not all the sheets (default True) 

38 - sheet_identifiers: list - to process specific sheets of the Excel file (identifiable by index or name) 

39 - skip_n_first_rows: int - to skip the first n rows from the Excel file (default 0) 

40 - parse_n_rows: int - to only parse the first n rows (default None) 

41 :return: SDCDataframe 

42 """ 

43 

44 file_obj = self.file_obj 

45 

46 try: 

47 df = Dataframe(SDCDFTypes.PANDAS, self.schema) 

48 pandas_df = pd.read_excel( 

49 file_obj, 

50 header=self.args["header"], 

51 sheet_name=self.args["sheet_identifiers"], 

52 skiprows=self.args["skip_n_first_rows"], 

53 nrows=self.args["parse_n_rows"]) 

54 collected = pd.DataFrame() 

55 for df_ in pandas_df.items(): 

56 collected = pd.concat([collected, df_[1]], axis=0) 

57 df.process_df(collected) 

58 return df 

59 

60 except Exception as e: 

61 traceback.print_exc(file=sys.stdout) 

62 logging.error(e) 

63 logging.error(f"Failed loading Excel data to SDCDataframe. {e}") 

64 return None