Coverage for libs/sdc_etl_libs/sdc_file_helpers/SDCFileHelpers.py : 31%

import csv
import logging
import os
from datetime import datetime, timedelta

from dateutil import parser
from dateutil.relativedelta import relativedelta


class SDCFileHelpers:
    @staticmethod
    def convert_file_to_flattened_dict(file_object_, file_type_='csv', delimiter_=',', column_header_list_=None):
        """
        Converts a file object to a list of flattened dictionaries.
        :param file_object_: File object that contains the lines of data to
            be processed.
        :param file_type_: File type of object. Default = 'csv'.
        :param delimiter_: File delimiter of object. Default = ','.
        :param column_header_list_: List of column headers. Default = None.
        :return: List of flattened dictionaries.
        """

        output = []

        if file_type_.lower() in ['txt', 'csv']:
            try:
                if column_header_list_:
                    reader = csv.DictReader(file_object_, delimiter=delimiter_, fieldnames=column_header_list_)
                else:
                    reader = csv.DictReader(file_object_, delimiter=delimiter_)
            except Exception as e:
                logging.error(e)
                logging.error("Failed flattening file.")
                # Re-raise so 'reader' is never referenced while undefined below.
                raise
        else:
            raise Exception(f"Error flattening file to dict. '{file_type_}' "
                            f"is currently not supported in SDCFileHelpers.")

        for line in reader:
            output.append(dict(line))

        return output
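    # A minimal usage sketch (hypothetical 'example.csv' file, not part of this repo):
    #
    #     with open("example.csv", newline="") as f:
    #         rows = SDCFileHelpers.convert_file_to_flattened_dict(f, file_type_='csv', delimiter_=',')
    #     # rows is a list of dicts keyed by the header row (or by column_header_list_
    #     # when provided), e.g. [{'id': '1', 'name': 'a'}, ...]; all values are strings.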
    @staticmethod
    def get_file_path(type_, path_):
        """
        Returns the absolute file path for the file type and relative path provided.
        The function uses this file as a point of reference to determine the
        correct absolute path.
        :param type_: Type of file to be retrieved. Options:
            'schema': Returns schema from the schemas directory.
            'sql': Returns sql from the sql directory.
            'metadata': Returns metadata from the metadata directory.
            'template': Returns template from the templates directory.
        :param path_: Path to file from the desired type_.
        :return: Path to file as string.
        """

        base_dir = os.path.dirname(os.path.abspath(__file__))

        if type_ == 'schema':
            file_path = os.path.join(base_dir, '..', 'schemas', path_)
        elif type_ == 'sql':
            file_path = os.path.join(base_dir, '..', 'sql', path_)
        elif type_ == 'metadata':
            file_path = os.path.join(base_dir, '..', 'metadata', path_)
        elif type_ == 'template':
            file_path = os.path.join(base_dir, '..', 'templates', path_)
        else:
            raise Exception("File type for get_file_path not supported.")

        if not os.path.exists(file_path):
            raise Exception("File provided does not exist.")

        return file_path
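    # A usage sketch (assuming the repo's sibling 'schemas' directory; the relative
    # path below is hypothetical, and the call raises if the file does not exist):
    #
    #     schema_path = SDCFileHelpers.get_file_path('schema', 'some_source/some_schema.json')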
    @staticmethod
    def compare_json_schema(obj1_, obj2_, format_="json"):
        """
        Compares two JSON-like objects to determine if they are the same or different.
        :param obj1_: First object to compare.
        :param obj2_: Second object to compare.
        :param format_: json/list. Format of obj1_ and obj2_ being passed in.
        :return: Boolean. False for no difference detected. True for differences found.
        """

        if len(obj1_) != len(obj2_):
            logging.info("NUMBER OF KEYS DIDN'T MATCH!!")
            return True

        if format_ == "json":
            for key, value in obj1_.items():
                if key not in obj2_:
                    logging.info(f"KEY '{key}' NOT FOUND!!")
                    return True
                elif obj1_[key] == obj2_[key]:
                    logging.info(f"KEY '{key}' MATCH FOUND!")
                    continue
                elif isinstance(obj1_[key], dict) and isinstance(obj2_[key], dict):
                    if SDCFileHelpers.compare_json_schema(obj1_[key], obj2_[key]):
                        return True
                elif isinstance(obj1_[key], list) and isinstance(obj2_[key], list):
                    if SDCFileHelpers.compare_json_schema(obj1_[key], obj2_[key], "list"):
                        return True
                elif type(obj1_[key]) != type(obj2_[key]):
                    logging.info(f"KEY '{key}' DIDN'T MATCH!!")
                    return True
                elif value != obj2_[key]:
                    logging.info(f"KEY '{key}' DIDN'T MATCH!!")
                    return True
        elif format_ == "list":
            for key, val in enumerate(obj1_):
                if val not in obj2_:
                    logging.info(f"KEY '{key}' NOT FOUND!!")
                    return True
            for key, val in enumerate(obj2_):
                if val not in obj1_:
                    logging.info(f"KEY '{key}' NOT FOUND!!")
                    return True

        return False
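    # A usage sketch with inline literals (hypothetical data):
    #
    #     a = {"fields": [{"name": "id", "type": "int"}]}
    #     b = {"fields": [{"name": "id", "type": "string"}]}
    #     SDCFileHelpers.compare_json_schema(a, b)  # True -> a difference was found
    #     SDCFileHelpers.compare_json_schema(a, a)  # False -> no difference detected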
    @staticmethod
    def get_path_on_date_hive_partitions(prefix_,
                                         date_hive_partitions_,
                                         partition_to_process_,
                                         run_datetime_,
                                         time_unit_look_back_: int = 0):
        """
        Creates a complete path, including the calculated date hive partition to be processed.
        :param prefix_: The set prefix, before date hive partitions.
        :param date_hive_partitions_: List. Ordered list of the date hive partitions present in the path to be read.
        :param partition_to_process_: One of the date hive partitions from the list provided in date_hive_partitions_.
        :param run_datetime_: String or Datetime object. Datetime to generate partition_to_process_ from.
        :param time_unit_look_back_: Int. Time units to subtract from run_datetime_. Default = 0.
        :return: Complete path concatenating the set prefix and the calculated date hive partitions,
            up to and including partition_to_process_. Returns None if partition_to_process_ is not
            in date_hive_partitions_.
        """

        datetime_to_process = SDCFileHelpers.get_datetime_to_process(partition_to_process_, run_datetime_,
                                                                     time_unit_look_back_)

        partition_values = {
            "year": f"year={datetime_to_process.strftime('%Y')}/",
            "month": f"month={datetime_to_process.strftime('%m')}/",
            "day": f"day={datetime_to_process.strftime('%d')}/",
            "hour": f"hour={datetime_to_process.strftime('%H')}/",
            "date": f"date={datetime_to_process.strftime('%Y-%m-%d')}/"
        }

        complete_path = prefix_
        for hive_partition in date_hive_partitions_:
            if hive_partition in partition_values:
                complete_path += partition_values[hive_partition]
            if hive_partition == partition_to_process_:
                return complete_path
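    # A usage sketch (hypothetical prefix):
    #
    #     SDCFileHelpers.get_path_on_date_hive_partitions(
    #         "s3://bucket/table/", ["year", "month", "day"], "day", "2021-06-15", 1)
    #     # -> "s3://bucket/table/year=2021/month=06/day=14/"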
    @staticmethod
    def get_datetime_to_process(partition_to_process_, run_datetime_, time_unit_look_back_: int = 0):
        """
        Calculates the appropriate truncated datetime from the run_datetime_ passed in.
        :param partition_to_process_: The base time part; time_unit_look_back_ units of it are
            subtracted from the current datetime value.
        :param run_datetime_: String or Datetime object. Datetime to generate partition_to_process_ from.
        :param time_unit_look_back_: Int. Time units to subtract from run_datetime_. Default = 0.
        :return: A datetime object, or None if partition_to_process_ is not one of the options.
        """

        if isinstance(run_datetime_, str):
            run_datetime_ = parser.parse(run_datetime_)

        partition_to_process = str(partition_to_process_).lower()
        if partition_to_process == "hour":
            return run_datetime_ - timedelta(hours=time_unit_look_back_)
        elif partition_to_process == "day":
            return run_datetime_ - timedelta(days=time_unit_look_back_)
        elif partition_to_process == "month":
            return datetime.combine(run_datetime_ + relativedelta(months=-time_unit_look_back_),
                                    datetime.min.time())
        elif partition_to_process == "year":
            return datetime.combine(run_datetime_ + relativedelta(years=-time_unit_look_back_),
                                    datetime.min.time())
        elif partition_to_process == "date":
            return run_datetime_ - timedelta(days=time_unit_look_back_)
        else:
            # Previously this message sat in an except block that an unmatched
            # option never triggered; log it explicitly instead.
            logging.info(f"The given partition_to_process_ {partition_to_process_} is not in the options.")
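    # A usage sketch (hypothetical timestamps):
    #
    #     SDCFileHelpers.get_datetime_to_process("hour", "2021-06-15 10:00:00", 2)
    #     # -> datetime(2021, 6, 15, 8, 0)
    #     SDCFileHelpers.get_datetime_to_process("month", "2021-06-15 10:00:00", 1)
    #     # -> datetime(2021, 5, 15, 0, 0)  (time truncated to midnight)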
    @staticmethod
    def is_file_endpoint_type(endpoint_type):
        """
        Determines if the source is a file endpoint.
        :param endpoint_type: Source Endpoint type.
        :return: Boolean.
        """

        return endpoint_type in ["s3", "sftp"]
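    # A usage sketch:
    #
    #     SDCFileHelpers.is_file_endpoint_type("s3")    # True
    #     SDCFileHelpers.is_file_endpoint_type("api")   # False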