Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import gzip 

2import io 

3import logging 

4 

5import pandas as pd 

6 

7from sdc_etl_libs.sdc_data_schema.schema_toolbox import SchemaToolbox 

8from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe 

9from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes 

10from sdc_etl_libs.sdc_file_helpers.SDCFile import SDCFile 

11from sdc_etl_libs.sdc_file_helpers.SDCJsonFileHelpers import SDCJsonFileHelpers 

12 

13 

class SDCJsonFile(SDCFile):
    """
    JSON flavour of SDCFile.

    Wraps a raw (optionally gzip-compressed) JSON file object and exposes it
    as a record count, as an in-memory file-like object, or as an SDCDataframe.
    """

    # Class-level defaults for the attributes used by this file type.
    # NOTE(review): the instance values are presumably populated by
    # SDCFile.__init__ (not visible here) - confirm against the base class.
    type = None
    file_name = None
    file_path = None
    file_obj = None
    schema = None
    endpoint_schema = None
    df = None
    endpoint_type = None

    def __init__(self, schema_, endpoint_schema_, file_name_, file_path_, file_obj_):
        """
        Creates an SDCJsonFile object representing a JSON file.
        :param schema_: Json schema of the data.
        :param endpoint_schema_: The endpoint schema of the source
        :param file_name_: Name of the file
        :param file_path_: Path you wish to write to
        :param file_obj_: Raw file object
        :return: SDCFILE Json File
        :raises Exception: If no JSON reader arguments can be generated from
            the schema / endpoint schema.
        """
        super().__init__(schema_, endpoint_schema_, file_name_, file_path_, file_obj_)
        # NOTE(review): ep_file_info_opts is not defined in this class; it is
        # presumably set by SDCFile.__init__ - confirm.
        self.custom_pre_processing_functions = self.ep_file_info_opts.get("custom_pre_processing_functions")

        # Keyword arguments later forwarded to pandas.read_json().
        self.args = SchemaToolbox.generate_file_output_args(self.schema, self.endpoint_schema)
        if not self.args:
            raise Exception("Missing JSON Args.")

    def get_file_size(self):
        """
        Gets the total number of records in a file.

        Only line-delimited JSON (the "lines" argument is set and truthy) can
        be counted; each line of the (decompressed) file is one record.
        :return: Int. Number of records. None when "lines" is missing from the
            JSON args or is falsy (multi-line JSON counting is not implemented).
        """
        if "lines" not in self.args:
            logging.error("Lines is mandatoy to get record count.")
            return None

        if not self.args["lines"]:
            # TODO implement multiline json objects count
            return None

        # Iterate the object directly. Wrapping it in io.BufferedReader would
        # close the underlying stream (possibly self.file_obj itself) as soon
        # as the wrapper is garbage-collected.
        f = self.get_file_as_object()
        # Count from the beginning regardless of where a previous reader left
        # the stream.
        f.seek(0)
        record_count = sum(1 for _ in f)
        # Leave the stream rewound for the next consumer.
        f.seek(0)
        logging.info(f"File contains {record_count:,} record(s).")
        return record_count

    def get_file_as_object(self):
        """
        Returns the file data as a file-like object.

        For gzip-compressed files the content is decompressed into a new
        in-memory buffer. If the file turns out not to be gzipped after all,
        or the compression type is not gzip, the raw file object is returned.
        :return: io.BytesIO with the decompressed data, or the raw file object.
        :raises OSError: If decompression fails for any reason other than the
            data not being gzipped.
        """
        if self.compression_type != "gzip":
            return self.file_obj

        # Start from the beginning so repeated calls see the whole file.
        self.file_obj.seek(0)
        try:
            # Closing the GzipFile does not close the wrapped file object.
            with gzip.open(self.file_obj) as gz:
                payload = gz.read()
        except OSError as e:
            if "Not a gzipped file" in str(e):
                # The gzip reader already consumed header bytes; rewind so the
                # caller receives the complete, uncompressed content.
                self.file_obj.seek(0)
                return self.file_obj
            logging.error(f"Failed get file as object. {e}")
            raise

        # Do not leave the raw object positioned at EOF.
        self.file_obj.seek(0)
        return io.BytesIO(payload)

    def get_file_as_dataframe(self):
        """
        Converts a json file object to an SDCDataframe.

        When custom pre-processing functions are configured they produce the
        pandas dataframe; otherwise the raw file object is parsed with
        pandas.read_json() using the generated JSON args.
        :return: A fully processed SDCDataframe, or None if loading failed
            (the error is logged).
        """
        try:
            if self.custom_pre_processing_functions:
                # NOTE(review): every function is applied to a fresh copy of
                # the file and the result is overwritten each iteration, so
                # only the last function's output is used - confirm intended.
                for function in self.custom_pre_processing_functions:
                    pandas_df = SDCJsonFileHelpers.apply_custom_pre_processing_function(
                        file_obj_=self.get_file_as_object(),
                        function_name_=function["name"],
                        opts_=function["opts"])
            else:
                # NOTE(review): the raw object is passed as-is; gzip handling
                # presumably relies on self.args - confirm.
                self.file_obj.seek(0)
                pandas_df = pd.read_json(self.file_obj, **self.args)

            df = Dataframe(None, self.schema)
            df.process_df(pandas_df)
            return df

        except Exception as e:
            # Best-effort contract: log and signal failure with None.
            logging.error(f"Failed loading JSON data to SDCDataframe. {e}")
            return None