1""" 

2SDC Data Exchange Snowflake Database Endpoint module 

3""" 

4import inspect 

5import logging 

6import types 

7import pandas as pd 

8from sdc_etl_libs.database_helpers.SDCQueryHelpers import SDCQueryHelpers 

9from sdc_etl_libs.database_helpers.SnowflakeDatabase import SnowflakeDatabase 

10from sdc_etl_libs.sdc_data_exchange.SDCDataExchangeEndpoint import SDCDataExchangeEndpoint 

11from sdc_etl_libs.sdc_data_exchange_loggers.SDCLoggerFactory import SDCLoggerFactory 

12from sdc_etl_libs.sdc_data_exchange_loggers.SDCSnowflakeLogger import SDCSnowflakeLoggerEnums as LoggerEnums 

13from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe 

14from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes 

15from sdc_etl_libs.sdc_file_helpers.SDCFile import SDCFile 

16from sdc_etl_libs.sdc_file_helpers.SDCFileFactory import SDCFileFactory 

17from sdc_etl_libs.sdc_data_validation.data_validation import SDCDataValidation 


class DataframeForDBEmpty(Exception):

    def __init__(self, value):
        Exception.__init__(self, value)


class DataframeFailedLoading(Exception):

    def __init__(self, value):
        Exception.__init__(self, value)


class SDCSnowflakeEndpoint(SnowflakeDatabase, SDCDataExchangeEndpoint):

    def __init__(self):
        """
        Snowflake endpoint constructor.
        """
        super().__init__()
        self.endpoint_type = None
        self.exchange_type = "snowflake"
        self.endpoint_tag = None
        self.data_schema = None
        self.endpoint_schema = None
        self.access_key = None
        self.secret_key = None
        self.table_name = None
        self.database_schema = None
        self.database = None
        self.username = None
        self.password = None
        self.account = None
        self.warehouse = None
        self.role = None
        self.files = []
        self.query = None
        self.upsert = None
        self.dedupe = None
        self.write_data_name_to_table = None
        self.bookmark_filenames = None
        self.sqlalchemy = True
        self.query_name = None
        self.query_params = []
        self.logger = None
        self.validator = None
        self.endpoint_uuid = None
        self.empty_is_success = None
        self.minutes_until_abandoned = None
        self.use_string_delimiter_ = False

    def create_exchange_endpoint(self, data_schema_, endpoint_schema_, **kwargs):
        """
        Creates a data exchange endpoint for Snowflake and establishes a
        connection to Snowflake. If the endpoint is a sink, the items already
        loaded to the table are set on self.endpoint_items. If the endpoint
        is a source, a source query is set to self.query.
        :param data_schema_: Dict. Entire JSON data schema.
        :param endpoint_schema_: Dict. JSON data schema of endpoint.
        :return: None.
        """
        # TODO: This will have to be updated to account for the new schema key layout
        self.endpoint_schema = endpoint_schema_
        self.data_schema = data_schema_
        self.endpoint_tag = self.endpoint_schema["tag"]
        self.endpoint_type = self.endpoint_schema["type"]

        self.database_schema = self.endpoint_schema["info"]["access"]["schema"]
        self.account = self.endpoint_schema["info"]["access"]["account"]
        self.database = self.endpoint_schema["info"]["access"]["database"]

        if self.endpoint_schema["type"] == "sink":
            self.table_name = self.endpoint_schema["info"]["access"]["table_name"]
            self.upsert = self.endpoint_schema["info"]["opts"]["upsert"]
            self.dedupe = self.endpoint_schema["info"]["opts"]["dedupe"]
            self.write_data_name_to_table = self.endpoint_schema["info"]["opts"].get("write_filename_to_db", False)
            self.bookmark_filenames = self.endpoint_schema["info"]["opts"].get("bookmark_filenames", False)
            self.use_string_delimiter_ = self.endpoint_schema["info"]["opts"].get("use_string_delimiter", False)

        self.__set_credentials(self.grab_credentials())

        self.connect(
            user_=self.username,
            password_=self.password,
            account_=self.account,
            warehouse_=self.warehouse,
            role_=self.role,
            database_=self.database,
            schema_=self.database_schema)

        # Set up validation if it's passed in the data schema (placeholder)
        if "validation" in self.endpoint_schema["info"]:
            if self.endpoint_schema["info"]["validation"]:
                self.validator = SDCDataValidation(data_schema_=self.data_schema, endpoint_schema_=self.endpoint_schema)

        # Set up logger if it's passed in the data schema
        if "logger" in self.endpoint_schema["info"]:
            if self.endpoint_schema["info"]["logger"]:
                self.logger = SDCLoggerFactory.get_logger(logging_info_=self.endpoint_schema["info"]["logger"])
                self.endpoint_uuid = self.logger.generate_endpoint_uuid(self.endpoint_schema, True)
                self.empty_is_success = self.endpoint_schema["info"]["logger"]["opts"]["empty_is_success"]
                self.minutes_until_abandoned = self.endpoint_schema["info"]["logger"]["opts"]["minutes_until_abandoned"]

        # Generate list of files posted to Snowflake. If a logger is used, also files that failed to sync.
        if self.endpoint_type == 'sink':
            self.endpoint_items = {"source": [], "sink_successful": [], "sink_failed": [], "sink_processing": []}
            if self.logger:
                self.endpoint_items["sink_successful"] = \
                    self.logger.retrieve_successful_runs(self.endpoint_uuid, self.empty_is_success)
                self.endpoint_items["sink_failed"] = \
                    self.logger.retrieve_failed_runs(self.endpoint_uuid, True, self.minutes_until_abandoned)
                self.endpoint_items["sink_processing"] = \
                    self.logger.retrieve_non_abandoned_runs(self.endpoint_uuid, self.minutes_until_abandoned)
            # TODO: Deprecate once all jobs are using SDCSnowflakeLogger:
            elif self.bookmark_filenames:
                self.endpoint_items["sink_successful"] = self.get_file_bookmarks(
                    database_=self.database, schema_=self.database_schema, for_table_=self.table_name)
            elif not self.bookmark_filenames:
                self.endpoint_items["sink_successful"] = []
            # TODO: Deprecated ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
            else:
                raise Exception("A logger is required to run Snowflake as a sink with the data exchange.")

        elif self.endpoint_type == 'source':
            try:
                self.query = SDCQueryHelpers.get_query_from_schema(self.endpoint_schema, kwargs=kwargs)
            except Exception:
                raise Exception("Error reading SQL file from schema.")
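
    # Example sink endpoint schema fragment (a hypothetical sketch; the key
    # names mirror the lookups in create_exchange_endpoint(), but real schemas
    # may carry additional keys):
    #
    #   {
    #       "tag": "main_sink",
    #       "type": "sink",
    #       "info": {
    #           "access": {
    #               "account": "sdc_account",
    #               "database": "EXAMPLE_DB",
    #               "schema": "EXAMPLE_SCHEMA",
    #               "table_name": "EXAMPLE_TABLE"
    #           },
    #           "opts": {
    #               "upsert": true,
    #               "dedupe": true,
    #               "write_filename_to_db": true,
    #               "bookmark_filenames": false,
    #               "use_string_delimiter": false
    #           }
    #       }
    #   }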

    def __set_credentials(self, creds_: dict = None):
        """
        Sets credential values to the following class attributes (if values exist):
        - self.username: Snowflake username
        - self.password: Snowflake password
        - self.warehouse: Snowflake warehouse
        - self.role: Snowflake role
        :param creds_: Dict. Key/value pairs of secrets. Default = None.
        :return: None.
        """
        if creds_:
            self.username = creds_.get("username")
            self.password = creds_.get("password")
            self.warehouse = creds_.get("warehouse") or self.endpoint_schema["info"]["access"].get("warehouse")
            self.role = creds_.get("role") or self.endpoint_schema["info"]["access"].get("role")
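
    # Example credentials dict for __set_credentials() (hypothetical values;
    # "warehouse" and "role" fall back to the schema's "access" block when
    # absent from the dict):
    #
    #   {"username": "ETL_USER", "password": "********",
    #    "warehouse": "LOADING_WH", "role": "ETL_ROLE"}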

    def get_data(self):
        """
        Executes self.query against Snowflake and returns the results as an
        SDCDataframe.
        :return: SDCDataframe of query results.
        """
        # TODO: Need to revisit once we integrate Spark.
        logging.info("Getting data from Snowflake table")
        try:
            df = pd.read_sql(self.query, self.connection)
        except Exception as e:
            logging.exception(e)
            raise Exception("An error occurred loading Snowflake data into pandas while executing the query.")
        sdc_df = Dataframe(SDCDFTypes.PANDAS, self.data_schema)
        sdc_df.process_df(df)
        return sdc_df
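
    # Minimal source-side usage sketch (hypothetical names; assumes the
    # endpoint was built from a "source"-type endpoint schema, which sets
    # self.query):
    #
    #   sdc_df = endpoint.get_data()
    #   print(len(sdc_df.df))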

    def write_data(self, data_, file_name_=None, force_merge_=False, item_uuid_=None):
        """
        Writes out an SDCDataframe or SDCFile object to a Snowflake table.
        :param data_: Data to be written to Snowflake. Can be an SDCDataframe
            object or a generator of SDCDataframes.
        :param file_name_: Name of file.
        :param force_merge_: Boolean. If True, a merge will be performed (regardless
            of upsert==False in the schema) using all columns in the dataframe -
            except for the ETL insert datetime column - as merge keys.
        :param item_uuid_: String. Data Item UUID - for use with Logger.
        :return: Log results of writing data to Snowflake.
        """
        df = None
        result = None

        # Handle data that is a generator of SDCDataframes
        if isinstance(data_, types.GeneratorType):
            try:
                df = next(data_)
            except Exception:
                logging.exception(f"{file_name_} failed loading to SDCDataframe via {type(data_)}.")
                raise DataframeFailedLoading(f"{file_name_} failed loading to SDCDataframe.")

        # Handle data that is a file-like object
        elif SDCFileFactory.is_type_sdcfile(data_):
            try:
                df = data_.get_file_as_dataframe()
            except Exception:
                logging.exception(f"{file_name_} failed loading to SDCDataframe via {type(data_)}.")
                raise DataframeFailedLoading(f"{file_name_} failed loading to SDCDataframe.")

        # Handle if data is an SDCFile
        elif isinstance(data_, SDCFile):
            raise Exception("Cannot write an SDCFile to Snowflake. Data must go through the SDCDataframe process.")

        # Handle data that is already in an SDCDataframe
        elif isinstance(data_, Dataframe):
            df = data_

        else:
            raise Exception("SDC Snowflake received an unknown data type.")

        if df.df is None or len(df.df) == 0:
            raise DataframeForDBEmpty(f"{file_name_} was empty.")

        if self.write_data_name_to_table:
            column_name = LoggerEnums.RawTableColumns.ETL_DATA_ITEM_COLUMN.value if self.logger else '_ETL_FILENAME'
            if file_name_:
                if file_name_ != 'block':
                    df.fill_in_column(column_name_=column_name, column_value_=file_name_, create_column_=True)
            else:
                raise Exception("No file name was given to write to the Snowflake table.")

        style = self.endpoint_schema["info"]["opts"].get('style', 'snowflake')
        logging.info('Applying formatting style "%s" to the fields', style)
        df.cleanse_column_names(style)

        try:
            # If a logger is passed in here, then self.logger.write_stats will be generated
            result = df.write_dataframe_to_database(
                sdc_database_handle_=self,
                table_name_=self.table_name,
                schema_name_=self.database_schema,
                upsert_=self.upsert,
                dedupe_=self.dedupe,
                logger_=self.logger,
                force_merge_=force_merge_,
                item_uuid_=item_uuid_,
                use_string_delimiter_=self.use_string_delimiter_
            )

            # TODO: Deprecate once all jobs are using SDCSnowflakeLogger:
            if self.bookmark_filenames:
                if file_name_:
                    try:
                        self.update_file_bookmarks(
                            database_=self.database,
                            schema_=self.database_schema,
                            for_table_=self.table_name.upper(),
                            file_name_=file_name_)
                    except Exception:
                        raise Exception("Error updating bookmarking table.")
                else:
                    raise Exception("No file name was given to write to the Snowflake table.")
            # TODO: Deprecated ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

        except Exception as e:
            raise Exception(f"Error loading data into Snowflake.\n{e}")

        return result
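
    # Minimal sink-side usage sketch (hypothetical names; assumes the endpoint
    # was built from a "sink"-type endpoint schema and `sdc_df` is an
    # SDCDataframe):
    #
    #   result = endpoint.write_data(sdc_df, file_name_="orders_2021-01-01.csv")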

    def delete_data(self):
        """
        Deletes data from a Snowflake endpoint if this method's call can be
        traced back to the EndpointIntegrationTest class. Used to delete
        Snowflake integration test sink data before a data exchange occurs.
        """
        stack = inspect.stack()
        for frame in stack:
            stack_line = frame.code_context[0] if frame.code_context else ''
            if 'EndpointIntegrationTest' in stack_line:
                self.truncate_table(self.database, self.database_schema, self.table_name)
                break
        else:
            raise Exception(
                "The SDCSnowflakeEndpoint.delete_data() method may not be used "
                "unless its call can be traced back to the EndpointIntegrationTest class.")

    def get_records_in_table_by_data_item(self, data_item_):
        """
        Queries the sink table for a data item via the ETL_DATA_ITEM_COLUMN
        column and returns a count of records.

        :param data_item_: String. Name of data item.
        :return: Int. Count of records for data item.
        """
        select_query = f"""
            SELECT
                COUNT(*) AS "TOTAL_RECORDS_WRITTEN"
            FROM "{self.database}"."{self.database_schema}"."{self.table_name}"
            WHERE "{LoggerEnums.RawTableColumns.ETL_DATA_ITEM_COLUMN.value}" = '{data_item_}'
        """

        self.execute_query(select_query, return_results_=True)

        total_records_loaded = 0
        for row in self.get_results():
            total_records_loaded = row[0]

        return int(total_records_loaded)
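

if __name__ == "__main__":
    # Minimal end-to-end sketch, not part of the library. The schema path and
    # its "endpoints" list layout are hypothetical; a real run needs a valid
    # SDC data schema and reachable Snowflake credentials.
    import json

    with open("schemas/example_schema.json") as fh:  # hypothetical schema file
        data_schema = json.load(fh)

    # Pick the sink endpoint out of the schema (assumes an "endpoints" list).
    sink_schema = next(e for e in data_schema["endpoints"] if e["type"] == "sink")

    endpoint = SDCSnowflakeEndpoint()
    endpoint.create_exchange_endpoint(data_schema, sink_schema)
    print(endpoint.get_records_in_table_by_data_item("orders_2021-01-01.csv"))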