Coverage for libs/sdc_etl_libs/sdc_file_helpers/SDCJsonFile.py : 65%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import gzip
2import io
3import logging
5import pandas as pd
7from sdc_etl_libs.sdc_data_schema.schema_toolbox import SchemaToolbox
8from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe
9from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes
10from sdc_etl_libs.sdc_file_helpers.SDCFile import SDCFile
11from sdc_etl_libs.sdc_file_helpers.SDCJsonFileHelpers import SDCJsonFileHelpers
class SDCJsonFile(SDCFile):
    """
    SDCFile implementation for JSON data.

    Wraps a raw (optionally gzip-compressed) JSON file object and exposes it as
    a record count, an in-memory file-like object, or a processed SDCDataframe.
    """

    # Class-level defaults for the instance attributes.
    # NOTE(review): presumably populated by SDCFile.__init__ - confirm against the base class.
    type = None
    file_name = None
    file_path = None
    file_obj = None
    schema = None
    endpoint_schema = None
    df = None
    endpoint_type = None

    def __init__(self, schema_, endpoint_schema_, file_name_, file_path_, file_obj_):
        """
        Creates an SDCJsonFile object representing a JSON file.
        :param schema_: Json schema of the data.
        :param endpoint_schema_: The endpoint schema of the source
        :param file_name_: Name of the file
        :param file_path_: Path you wish to write to
        :param file_obj_: Raw file object
        :return: SDCFILE Json File
        :raises Exception: If no JSON read arguments could be generated from the schemas.
        """
        super().__init__(schema_, endpoint_schema_, file_name_, file_path_, file_obj_)
        # NOTE(review): ep_file_info_opts is not set in this class; presumably
        # provided by SDCFile.__init__ - confirm.
        self.custom_pre_processing_functions = self.ep_file_info_opts.get("custom_pre_processing_functions")

        # Keyword arguments later forwarded to pandas.read_json.
        self.args = SchemaToolbox.generate_file_output_args(self.schema, self.endpoint_schema)
        if not self.args:
            raise Exception("Missing JSON Args.")

    def _rewind_file_obj(self):
        """
        Moves the raw file object back to its first byte when it supports seeking.
        Streams that cannot seek are left untouched instead of raising.
        """
        is_seekable = getattr(self.file_obj, "seekable", None)
        if callable(is_seekable) and is_seekable():
            self.file_obj.seek(0)

    def get_file_size(self):
        """
        Gets the total number of records in a file.

        Only line-delimited JSON (``args["lines"]`` truthy) is supported; one line
        is counted as one record.
        :return: Int. Number of records. None if "lines" is missing from the
            JSON args or is falsy (multiline JSON counting is not implemented).
        """
        if "lines" not in self.args:
            logging.error("Lines is mandatoy to get record count.")
            return None

        if not self.args["lines"]:
            # TODO implement multiline json objects count
            return None

        source = self.get_file_as_object()
        # Count from the first byte so an earlier partial read cannot skew the total.
        source.seek(0)
        # The stream is iterated directly: wrapping it in a temporary
        # io.BufferedReader would close the underlying object (possibly
        # self.file_obj) as soon as the wrapper is garbage-collected.
        record_count = sum(1 for _ in source)
        # Leave the stream rewound for whoever reads it next.
        source.seek(0)
        logging.info(f"File contains {record_count:,} record(s).")
        return record_count

    def get_file_as_object(self):
        """
        Returns the file data as a file-like object, decompressing gzip data in memory.
        :return: BytesIO holding the decompressed bytes when the compression type
            is "gzip" and the data really is gzipped; otherwise the original file object.
        :raises OSError: If reading the gzip data fails for any reason other than
            the data not being gzipped.
        """
        if self.compression_type != "gzip":
            return self.file_obj

        # Decompress from the start so repeated calls return the full payload
        # rather than whatever is left after a previous read.
        self._rewind_file_obj()
        try:
            # Closing the GzipFile releases the decompressor; it does not close
            # the file object that was handed to gzip.open.
            with gzip.open(self.file_obj) as gzip_reader:
                return io.BytesIO(gzip_reader.read())
        except OSError as e:
            if "Not a gzipped file" in str(e):
                # The header probe already consumed bytes; rewind so the caller
                # sees the uncompressed data from its beginning.
                self._rewind_file_obj()
                return self.file_obj
            logging.error(f"Failed get file as object. {e}")
            raise e

    def get_file_as_dataframe(self):
        """
        Converts a json file object to an SDCDataframe.
        :return: A fully processed SDCDataframe, or None if loading or processing
            fails (the error is logged, not raised).
        """
        try:
            if self.custom_pre_processing_functions:
                # NOTE(review): every function receives a fresh file object and
                # its result replaces the previous one, so only the last
                # function's output is used - confirm chaining is not intended.
                for function_spec in self.custom_pre_processing_functions:
                    pandas_df = SDCJsonFileHelpers.apply_custom_pre_processing_function(
                        file_obj_=self.get_file_as_object(),
                        function_name_=function_spec["name"],
                        opts_=function_spec["opts"])
            else:
                self.file_obj.seek(0)
                pandas_df = pd.read_json(self.file_obj, **self.args)

            df = Dataframe(None, self.schema)
            df.process_df(pandas_df)
            return df

        except Exception as e:
            logging.error(f"Failed loading JSON data to SDCDataframe. {e}")
            return None