Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import csv 

2import logging 

3import os 

4from datetime import datetime, timedelta 

5 

6from dateutil import parser 

7from dateutil.relativedelta import relativedelta 

8 

9 

class SDCFileHelpers:
    """Collection of static helpers for flat files, project file paths,
    JSON-like schema comparison and date based hive partition paths."""

    @staticmethod
    def convert_file_to_flattened_dict(file_object_, file_type_='csv', delimiter_=',', column_header_list_=None):
        """
        Converts a file object to a list of flattened dictionaries.

        :param file_object_: File object (or any iterable of text lines) that
            contains the lines of data to be processed.
        :param file_type_: File type of object ('csv' or 'txt', case
            insensitive). Default = 'csv'.
        :param delimiter_: File delimiter of object. Default = ','.
        :param column_header_list_: List of column headers. When empty or
            None, the first row of the file is used as the header.
            Default = None.
        :return: List of flattened dictionaries, one per data row.
        :raises Exception: If file_type_ is not supported. Any error raised by
            the csv module while building the reader or reading rows is logged
            and re-raised unchanged.
        """

        if file_type_.lower() not in ('txt', 'csv'):
            raise Exception(f"Error flattening file to dict. '{file_type_}' "
                            f"currently not support in SDCFileHelpers")

        try:
            # fieldnames=None makes DictReader take the header from the first row.
            reader = csv.DictReader(file_object_, delimiter=delimiter_,
                                    fieldnames=column_header_list_ or None)
            return [dict(line) for line in reader]

        except Exception as e:
            logging.error(e)
            logging.error("Failed flattening file.")
            # Re-raise the real cause; swallowing it here would only surface
            # later as an unrelated "reader is not defined" style error.
            raise

    @staticmethod
    def get_file_path(type_, path_):
        """
        Returns the file path for the file type and relative path provided.
        The directory containing this file is used as the point of reference:
        the result is <directory of this file>/../<type directory>/<path_>.

        :param type_: Type of file to be retrieved. Options:
            'schema': Looks in the 'schemas' directory.
            'sql': Looks in the 'sql' directory.
            'metadata': Looks in the 'metadata' directory.
            'template': Looks in the 'templates' directory.
        :param path_: Path to file, relative to the directory for type_.
        :return: Path to file as string.
        :raises Exception: If type_ is not supported or the resulting path
            does not exist.
        """

        directory_by_type = {
            'schema': 'schemas',
            'sql': 'sql',
            'metadata': 'metadata',
            'template': 'templates',
        }

        # Only strings can be valid types; this also keeps unhashable values
        # on the "not supported" error path.
        directory = directory_by_type.get(type_) if isinstance(type_, str) else None
        if directory is None:
            raise Exception("File type for get_file_path not supported.")

        file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', directory, path_)

        if not os.path.exists(file_path):
            raise Exception("File provided does not exist.")

        return file_path

    @staticmethod
    def compare_json_schema(obj1_, obj2_, format_="json"):
        """
        Compares two JSON-like objects together to determine if they are the same or different.

        Dictionaries are compared key by key, recursing into nested
        dictionaries and lists. Lists are compared by length and by mutual
        membership only, so element order is ignored and repeated elements
        are not counted.

        :param obj1_: First object to compare.
        :param obj2_: Second object to compare.
        :param format_: json/list. Format of obj1_ and obj2_ being passed in.
            Any other value skips the element comparison (only lengths are checked).
        :return: Boolean. False for no difference detected. True for differences found.
        """

        if len(obj1_) != len(obj2_):
            logging.info("NUMBER OF KEYS DIDN'T MATCH!!")
            return True

        if format_ == "json":
            for key, value in obj1_.items():
                if key not in obj2_:
                    logging.info(f"KEY '{key}' NOT FOUND!!")
                    return True

                other = obj2_[key]
                if value == other:
                    logging.info(f"KEY '{key}' MATCH FOUND!")
                    continue

                # Values differ: nested containers get a structural comparison
                # (lists may still match when only the order differs).
                if isinstance(value, dict) and isinstance(other, dict):
                    if SDCFileHelpers.compare_json_schema(value, other):
                        return True
                elif isinstance(value, list) and isinstance(other, list):
                    if SDCFileHelpers.compare_json_schema(value, other, "list"):
                        return True
                elif type(value) is not type(other) or value != other:
                    logging.info(f"KEY '{key}' DIDN'T MATCH!!")
                    return True

        elif format_ == "list":
            # Check membership in both directions.
            for index, val in enumerate(obj1_):
                if val not in obj2_:
                    logging.info(f"KEY '{index}' NOT FOUND!!")
                    return True
            for index, val in enumerate(obj2_):
                if val not in obj1_:
                    logging.info(f"KEY '{index}' NOT FOUND!!")
                    return True

        return False

    @staticmethod
    def get_path_on_date_hive_partitions(prefix_,
                                         date_hive_partitions_,
                                         partition_to_process_,
                                         run_datetime_,
                                         time_unit_look_back_: int = 0):
        """
        Creates a complete path including calculated date hive partition to be processed.

        :param prefix_: The actual set prefix, before date hive partitions.
        :param date_hive_partitions_: List. Ordered list with the present date hive partitions
            in the path to be read. Recognized names: 'year', 'month', 'day', 'hour', 'date'.
            Other names add nothing to the path.
        :param partition_to_process_: One of the date hive partitions from the list provided
            in the parameter date_hive_partitions_. The path is cut right after this partition.
        :param run_datetime_: String or Datetime object. Datetime to generate partition_to_process_ from.
        :param time_unit_look_back_: Int. Time unit to subtract from run_datetime_. Default = 0.
        :return: complete_path. Complete path concatenating set prefix and calculated date hive
            partitions up to and including partition_to_process_, or None when
            partition_to_process_ does not appear in date_hive_partitions_.
        :raises AttributeError: If get_datetime_to_process could not produce a datetime
            (it returns None for unsupported partitions).
        """
        datetime_to_process = SDCFileHelpers.get_datetime_to_process(partition_to_process_, run_datetime_,
                                                                     time_unit_look_back_)
        path_segments = {
            "year": f"year={datetime_to_process.strftime('%Y')}/",
            "month": f"month={datetime_to_process.strftime('%m')}/",
            "day": f"day={datetime_to_process.strftime('%d')}/",
            "hour": f"hour={datetime_to_process.strftime('%H')}/",
            "date": f"date={datetime_to_process.strftime('%Y-%m-%d')}/",
        }

        complete_path = prefix_
        for hive_partition in date_hive_partitions_:
            complete_path += path_segments.get(hive_partition, "")
            if hive_partition == partition_to_process_:
                return complete_path

        # The requested partition never showed up in the list.
        return None

    @staticmethod
    def get_datetime_to_process(partition_to_process_, run_datetime_, time_unit_look_back_: int = 0):
        """
        Calculates the datetime to process by moving run_datetime_ back by
        time_unit_look_back_ units of the given partition.

        For 'hour', 'day' and 'date' the time of day of run_datetime_ is kept.
        For 'month' and 'year' the result is set to midnight of the shifted day.

        :param partition_to_process_: The base time part, to subtract units from the run
            datetime. Options (case insensitive): 'hour', 'day', 'date', 'month', 'year'.
        :param run_datetime_: String or Datetime object. Datetime to generate partition_to_process_ from.
            Strings are parsed with dateutil.
        :param time_unit_look_back_: Int. Time unit to subtract from run_datetime_. Default = 0.
        :return: A datetime object, or None when partition_to_process_ is not one of
            the options or the calculation fails (the reason is logged).
        """

        if isinstance(run_datetime_, str):
            run_datetime_ = parser.parse(run_datetime_)

        try:
            partition_to_process = partition_to_process_.lower()
            if partition_to_process == "hour":
                return run_datetime_ - timedelta(hours=time_unit_look_back_)
            if partition_to_process in ("day", "date"):
                return run_datetime_ - timedelta(days=time_unit_look_back_)
            if partition_to_process == "month":
                return datetime.combine(run_datetime_ + relativedelta(months=-time_unit_look_back_),
                                        datetime.min.time())
            if partition_to_process == "year":
                return datetime.combine(run_datetime_ + relativedelta(years=-time_unit_look_back_),
                                        datetime.min.time())

        except Exception as e:
            # Best effort by design: callers get None instead of an exception,
            # but the real cause is logged so it can be diagnosed.
            logging.error(f"Failed calculating datetime for partition_to_process_ "
                          f"{partition_to_process_}: {e}")
            return None

        logging.warning(f"The given partition_to_process_ {partition_to_process_} is not in the options.")
        return None

    @staticmethod
    def is_file_endpoint_type(endpoint_type):
        """
        Determines if the source is file endpoint.

        :param endpoint_type: Source Endpoint type.
        :return: boolean. True only for "s3" and "sftp" (exact, case sensitive match).
        """
        return endpoint_type in ("s3", "sftp")