# libs/sdc_etl_libs/sdc_exchange_endpoints/database_endpoints/SDCSnowflakeEndpoint.py

1"""
2SDC Data Exchange Snowflake Database Endpoint module
3"""
4import inspect
5import logging
6import types
7import pandas as pd
8from sdc_etl_libs.database_helpers.SDCQueryHelpers import SDCQueryHelpers
9from sdc_etl_libs.database_helpers.SnowflakeDatabase import SnowflakeDatabase
10from sdc_etl_libs.sdc_data_exchange.SDCDataExchangeEndpoint import SDCDataExchangeEndpoint
11from sdc_etl_libs.sdc_data_exchange_loggers.SDCLoggerFactory import SDCLoggerFactory
12from sdc_etl_libs.sdc_data_exchange_loggers.SDCSnowflakeLogger import SDCSnowflakeLoggerEnums as LoggerEnums
13from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe
14from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes
15from sdc_etl_libs.sdc_file_helpers.SDCFile import SDCFile
16from sdc_etl_libs.sdc_file_helpers.SDCFileFactory import SDCFileFactory
17from sdc_etl_libs.sdc_data_validation.data_validation import SDCDataValidation
20class DataframeForDBEmpty(Exception):
22 def __init__(self, value):
23 Exception.__init__(self, value)
26class DataframeFailedLoading(Exception):
28 def __init__(self, value):
29 Exception.__init__(self, value)


class SDCSnowflakeEndpoint(SnowflakeDatabase, SDCDataExchangeEndpoint):

    def __init__(self):
        """
        Snowflake endpoint constructor.
        """

        super().__init__()
        self.endpoint_type = None
        self.exchange_type = "snowflake"
        self.endpoint_tag = None
        self.data_schema = None
        self.endpoint_schema = None
        self.access_key = None
        self.secret_key = None
        self.table_name = None
        self.database_schema = None
        self.database = None
        self.username = None
        self.password = None
        self.account = None
        self.warehouse = None
        self.role = None
        self.files = []
        self.query = None
        self.upsert = None
        self.dedupe = None
        self.write_data_name_to_table = None
        self.bookmark_filenames = None
        self.sqlalchemy = True
        self.query_name = None
        self.query_params = []
        self.logger = None
        self.validator = None
        self.endpoint_uuid = None
        self.empty_is_success = None
        self.minutes_until_abandoned = None
        self.use_string_delimiter_ = False

    def create_exchange_endpoint(self, data_schema_, endpoint_schema_, **kwargs):
        """
        Creates a data exchange endpoint for Snowflake and establishes a
        connection to Snowflake. If the endpoint is a sink, lists of items
        already loaded to the table are set on self.endpoint_items. If the
        endpoint is a source, a source query is set to self.query.
        :param data_schema_: Dict. Entire JSON data schema.
        :param endpoint_schema_: Dict. JSON data schema of the endpoint.
        :return: None.
        """

        # TODO: This will have to be updated to account for the new schema key layout
        self.endpoint_schema = endpoint_schema_
        self.data_schema = data_schema_
        self.endpoint_tag = self.endpoint_schema["tag"]
        self.endpoint_type = self.endpoint_schema["type"]

        self.database_schema = self.endpoint_schema["info"]["access"]["schema"]
        self.account = self.endpoint_schema["info"]["access"]["account"]
        self.database = self.endpoint_schema["info"]["access"]["database"]

        if self.endpoint_schema["type"] == "sink":
            self.table_name = self.endpoint_schema["info"]["access"]["table_name"]
            self.upsert = self.endpoint_schema["info"]["opts"]["upsert"]
            self.dedupe = self.endpoint_schema["info"]["opts"]["dedupe"]
            self.write_data_name_to_table = self.endpoint_schema["info"]["opts"].get("write_filename_to_db", False)
            self.bookmark_filenames = self.endpoint_schema["info"]["opts"].get("bookmark_filenames", False)
            self.use_string_delimiter_ = self.endpoint_schema["info"]["opts"].get("use_string_delimiter", False)

        self.__set_credentials(self.grab_credentials())

        self.connect(
            user_=self.username,
            password_=self.password,
            account_=self.account,
            warehouse_=self.warehouse,
            role_=self.role,
            database_=self.database,
            schema_=self.database_schema)

        # Set up validation if it's passed in the data schema (placeholder)
        if "validation" in self.endpoint_schema["info"]:
            if self.endpoint_schema["info"]["validation"]:
                self.validator = SDCDataValidation(data_schema_=self.data_schema, endpoint_schema_=self.endpoint_schema)

        # Set up logger if it's passed in the data schema
        if "logger" in self.endpoint_schema["info"]:
            if self.endpoint_schema["info"]["logger"]:
                self.logger = SDCLoggerFactory.get_logger(logging_info_=self.endpoint_schema["info"]["logger"])
                self.endpoint_uuid = self.logger.generate_endpoint_uuid(self.endpoint_schema, True)
                self.empty_is_success = self.endpoint_schema["info"]["logger"]["opts"]["empty_is_success"]
                self.minutes_until_abandoned = self.endpoint_schema["info"]["logger"]["opts"]["minutes_until_abandoned"]

        # Generate list of files posted to Snowflake. If a logger is used, also files that failed syncing.
        if self.endpoint_type == 'sink':
            self.endpoint_items = {"source": [], "sink_successful": [], "sink_failed": [], "sink_processing": []}
            if self.logger:
                self.endpoint_items["sink_successful"] = \
                    self.logger.retrieve_successful_runs(self.endpoint_uuid, self.empty_is_success)
                self.endpoint_items["sink_failed"] = \
                    self.logger.retrieve_failed_runs(self.endpoint_uuid, True, self.minutes_until_abandoned)
                self.endpoint_items["sink_processing"] = \
                    self.logger.retrieve_non_abandoned_runs(self.endpoint_uuid, self.minutes_until_abandoned)
            # TODO: Deprecate once all jobs are using SDCSnowflakeLogger:
            elif self.bookmark_filenames:
                if self.endpoint_type == 'sink':
                    self.endpoint_items["sink_successful"] = self.get_file_bookmarks(
                        database_=self.database, schema_=self.database_schema, for_table_=self.table_name)
            elif not self.bookmark_filenames:
                self.endpoint_items["sink_successful"] = []
            # TODO: Deprecated ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
            else:
                raise Exception("A logger is required to run Snowflake as a sink with the data exchange.")

        elif self.endpoint_type == 'source':
            try:
                self.query = SDCQueryHelpers.get_query_from_schema(self.endpoint_schema, kwargs=kwargs)
            except Exception:
                raise Exception("Error reading SQL file.")

    def __set_credentials(self, creds_: dict = None):
        """
        Sets credential values to the following class attributes (if values exist):
        - self.username: Snowflake username
        - self.password: Snowflake password
        - self.warehouse: Snowflake warehouse
        - self.role: Snowflake role
        :param creds_: Dict. Key/value pairs of secrets. Default = None.
        :return: None.
        """

        if creds_:
            self.username = creds_.get("username")
            self.password = creds_.get("password")
            self.warehouse = creds_.get("warehouse") or self.endpoint_schema["info"]["access"].get("warehouse")
            self.role = creds_.get("role") or self.endpoint_schema["info"]["access"].get("role")
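
    # Expected shape of creds_ (hypothetical values). Warehouse and role fall back
    # to the endpoint schema's "access" block when missing from the secrets:
    #
    #   {"username": "ETL_USER", "password": "****",
    #    "warehouse": "LOAD_WH", "role": "ETL_ROLE"}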

    def get_data(self):
        """
        Runs self.query against Snowflake and returns the results as an
        SDCDataframe.
        :return: SDCDataframe of query results.
        """
        # TODO: Need to revisit once we integrate spark.
        logging.info("Getting data from Snowflake table")
        try:
            df = pd.read_sql(self.query, self.connection)
        except Exception as e:
            logging.exception(e)
            raise Exception("Error loading Snowflake data into Pandas while executing the query.")
        sdc_df = Dataframe(SDCDFTypes.PANDAS, self.data_schema)
        sdc_df.process_df(df)
        return sdc_df
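
    # Usage sketch for a source endpoint (assumes create_exchange_endpoint already
    # set self.query from the schema's SQL file):
    #
    #   sdc_df = endpoint.get_data()
    #   print(sdc_df.df.head())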

    def write_data(self, data_, file_name_=None, force_merge_=False, item_uuid_=None):
        """
        Write out an SDCDataframe to a Snowflake table.
        :param data_: Data to be written to Snowflake. Can be an SDCDataframe,
            a generator of SDCDataframes, or a file-like object recognized by
            SDCFileFactory.
        :param file_name_: Name of file.
        :param force_merge_: Boolean. If True, a merge will be performed (regardless
            of upsert being False in the schema) using all columns in the dataframe -
            except for the ETL insert datetime column - as merge keys.
        :param item_uuid_: String. Data Item UUID - for use with the logger.
        :return: Log results of writing data to Snowflake.
        """

        df = None
        result = None

        # Handle data that is a generator of SDCDataframes
        if isinstance(data_, types.GeneratorType):
            try:
                df = next(data_)
            except Exception:
                logging.exception(f"{file_name_} failed loading to SDCDataframe via {type(data_)}.")
                raise DataframeFailedLoading(f"{file_name_} failed loading to SDCDataframe.")

        # Handle data that is a file-like object
        elif SDCFileFactory.is_type_sdcfile(data_):
            try:
                df = data_.get_file_as_dataframe()
            except Exception:
                logging.exception(f"{file_name_} failed loading to SDCDataframe via {type(data_)}.")
                raise DataframeFailedLoading(f"{file_name_} failed loading to SDCDataframe.")

        # Handle if data is a raw SDCFile
        elif isinstance(data_, SDCFile):
            raise Exception("Cannot write an SDCFile to Snowflake. Data must go through the SDCDataframe process.")

        # Handle data that is already in an SDCDataframe
        elif isinstance(data_, Dataframe):
            df = data_

        else:
            raise Exception("SDC Snowflake received an unknown data type.")

        if df.df is None or len(df.df) == 0:
            raise DataframeForDBEmpty(f"{file_name_} was empty.")

        if self.write_data_name_to_table:
            column_name = LoggerEnums.RawTableColumns.ETL_DATA_ITEM_COLUMN.value if self.logger else '_ETL_FILENAME'
            if file_name_:
                if file_name_ != 'block':
                    df.fill_in_column(column_name_=column_name, column_value_=file_name_, create_column_=True)
            else:
                raise Exception("No file name was given to write to the Snowflake table.")

        style = self.endpoint_schema["info"]["opts"].get('style', 'snowflake')
        logging.info('Applying formatting style "%s" to the fields', style)
        df.cleanse_column_names(style)

        try:
            # If a logger is passed in here, self.logger.write_stats will be generated
            result = df.write_dataframe_to_database(
                sdc_database_handle_=self,
                table_name_=self.table_name,
                schema_name_=self.database_schema,
                upsert_=self.upsert,
                dedupe_=self.dedupe,
                logger_=self.logger,
                force_merge_=force_merge_,
                item_uuid_=item_uuid_,
                use_string_delimiter_=self.use_string_delimiter_
            )

            # TODO: Deprecate once all jobs are using SDCSnowflakeLogger:
            if self.bookmark_filenames:
                if file_name_:
                    try:
                        self.update_file_bookmarks(
                            database_=self.database,
                            schema_=self.database_schema,
                            for_table_=self.table_name.upper(),
                            file_name_=file_name_)
                    except Exception:
                        raise Exception("Error updating bookmarking table.")
                else:
                    raise Exception("No file name was given to write to the Snowflake table.")
            # TODO: Deprecated ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

        except Exception as e:
            raise Exception(f"Error loading data into Snowflake.\n{e}")

        return result
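
    # Usage sketch for write_data (hypothetical names; assumes the endpoint was
    # built via create_exchange_endpoint, so the connection and table are set):
    #
    #   sdc_df = Dataframe(SDCDFTypes.PANDAS, full_schema)
    #   sdc_df.process_df(pandas_df)
    #   log = endpoint.write_data(sdc_df, file_name_="data_2021_01_01.csv")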

    def delete_data(self):
        """
        Deletes data from a Snowflake endpoint if this method's call can be
        traced back to the EndpointIntegrationTest class. Used to delete
        Snowflake integration test sink data before a data exchange occurs.
        """
        stack = inspect.stack()
        for frame in stack:
            # code_context can be None for frames without available source
            if frame.code_context and 'EndpointIntegrationTest' in frame.code_context[0]:
                self.truncate_table(self.database, self.database_schema, self.table_name)
                break
        else:
            raise Exception("""
                The SDCSnowflakeEndpoint.delete_data() method may not be
                used unless its call can be traced back to the
                EndpointIntegrationTest class.
                """)

    def get_records_in_table_by_data_item(self, data_item_):
        """
        Queries the sink table for a data item via the ETL_DATA_ITEM_COLUMN
        column and returns the count of records.
        :param data_item_: String. Name of the data item.
        :return: Int. Count of records for the data item.
        """

        select_query = f"""
            SELECT
                COUNT(*) as "TOTAL_RECORDS_WRITTEN"
            FROM "{self.database}"."{self.database_schema}"."{self.table_name}"
            WHERE "{LoggerEnums.RawTableColumns.ETL_DATA_ITEM_COLUMN.value}" = '{data_item_}'
            """

        self.execute_query(select_query, return_results_=True)

        total_records_loaded = 0
        for row in self.get_results():
            total_records_loaded = row[0]

        return int(total_records_loaded)
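
    # Usage sketch (hypothetical data item name):
    #
    #   n = endpoint.get_records_in_table_by_data_item("data_2021_01_01.csv")
    #   logging.info("%s records written for the data item", n)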