Coverage for libs/sdc_etl_libs/sdc_data_exchange/SDCDataExchange.py : 18%

import logging
import math
import os
import sys
import traceback
import uuid

from sdc_etl_libs.sdc_data_exchange.SDCDataExchangeEndpointFactory import \
    SDCDataExchangeEndpointFactory
from sdc_etl_libs.sdc_data_exchange.SDCDataExchangeEnums import (
    APIExchangeTypes, APIResultTypes, DatabaseExchangeTypes, DataExchangeTypes,
    FileExchangeTypes, FileResultTypes, NoSqlDatabaseExchangeTypes, MockExchangeTypes, ExchangeResultTypes)
from sdc_etl_libs.sdc_data_exchange_loggers.SDCSnowflakeLogger import \
    SDCSnowflakeLoggerEnums as LoggerEnums
from sdc_etl_libs.sdc_data_schema.schema_toolbox import SchemaToolbox
from sdc_etl_libs.sdc_data_schema.schema_validation import SchemaValidation
from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe
from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes
from sdc_etl_libs.sdc_exchange_endpoints.database_endpoints.SDCSnowflakeEndpoint import DataframeForDBEmpty
from sdc_etl_libs.sdc_file_helpers.SDCFileFactory import SDCFileFactory
from sdc_etl_libs.sdc_file_helpers.SDCFileHelpers import SDCFileHelpers
from sdc_etl_libs.sdc_filetransfer.utils.delete_files_mechanism.FilenameMatchDeleteCriteria import \
    FilenameMatchDeleteCriteria
from sdc_etl_libs.sdc_dataframe.sdc_dataframe_helpers import SDCDataframeHelpers


class SDCDataExchange:

    # TODO: This will have to be updated to account for the new schema key layout

    def __init__(self, schema_name_, source_endpoint_tag_, sink_endpoint_tag_, **kwargs):
        """
        SDCDataExchange constructor. Creates source and sink instances and
        compares data lists between endpoints (if applicable to the endpoint types).
        :param schema_name_: Name of data schema.
        :param source_endpoint_tag_: Tag of source endpoint (from data schema).
        :param sink_endpoint_tag_: Tag of sink endpoint (from data schema).
        """

        self.source_endpoint_schema = None
        self.source_endpoint_uuid = None
        self.source = None
        self.sinks = []
        self.data_schema = None

        self.source_endpoint_tag = source_endpoint_tag_
        self.sink_endpoint_tag = sink_endpoint_tag_
        self.etl_results_log = {}
        self.sink_items_to_process = {}

        # Set up the source and grab source information.
        self.get_data_schema(schema_name_)

        schema_validation = SchemaValidation()
        self.data_schema = schema_validation.validate_schema(self.data_schema)
        self.source_endpoint_schema = SchemaToolbox.get_endpoint_data_from_schema(self.data_schema,
                                                                                  self.source_endpoint_tag)

        self.dataframe_type = SchemaToolbox.determine_dataframe_type_from_schema(self.data_schema)

        self.source = SDCDataExchangeEndpointFactory.get_endpoint(self.data_schema,
                                                                  self.source_endpoint_schema,
                                                                  **kwargs)
        self.sink_setup(**kwargs)

        self.process_by_block = False

        if SDCFileHelpers.is_file_endpoint_type(self.source_endpoint_schema["info"]["type"]) and \
                self.source.endpoint_items["source"] is not None:
            distinct_sink_items = set()
            for sink in self.sinks:
                distinct_sink_items.update(self.sink_items_to_process[sink.endpoint_tag])
            self.sink_items_to_process[self.source.endpoint_tag] = distinct_sink_items
            self.process_by_block = self.source_endpoint_schema["info"]["opts"]["process_by_block"]
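
    # Illustrative usage sketch (hypothetical schema and endpoint tag names; real
    # values come from the data schema's endpoint definitions):
    #
    #   exchange = SDCDataExchange(
    #       "Example-Namespace/example-name",            # "<namespace>/<name>"
    #       source_endpoint_tag_="main_source",
    #       sink_endpoint_tag_=["SDC_sink_0", "SDC_sink_1"])
    #   results_log = exchange.exchange_data()
    #   for sink_tag, messages in results_log.items():
    #       logging.info(f"{sink_tag}: {messages}")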

    def get_data_schema(self, schema_name_):
        """
        Retrieves the SDC data schema for the given schema name and stores it in self.data_schema.
        :param schema_name_: String. Name of data schema.
        :return: None.
        """

        schema_registry_api_url = os.getenv('schema_registry_api_url')
        namespace = schema_name_.split('/')[0]
        name = schema_name_.split('/')[1]
        self.data_schema = SchemaToolbox.get_schema(
            schema_registry_api_url_=schema_registry_api_url,
            namespace_=namespace,
            name_=name,
            local_schema_path_=schema_name_)
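
    # e.g. a schema_name_ of "Example-Namespace/example-name" (hypothetical) splits into
    # namespace_="Example-Namespace" and name_="example-name", with the full string also
    # passed through as local_schema_path_.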

    def sink_setup(self, **kwargs):
        """
        Sets up the sinks from the list of sink tags passed in, including:
            - Validating the sink schema.
            - Setting the sink up as its endpoint class.
            - Determining which files need to be synced from the source.
        :return: None.
        """

        # One or more sinks can be passed. Ensure sink_endpoint_tag is a list.
        if isinstance(self.sink_endpoint_tag, str):
            temp_list = []
            temp_list.append(self.sink_endpoint_tag)
            self.sink_endpoint_tag = temp_list.copy()

        for tag in self.sink_endpoint_tag:

            ep_schema = SchemaToolbox.get_endpoint_data_from_schema(self.data_schema, tag)

            # Get the endpoint class for the sink.
            sink = SDCDataExchangeEndpointFactory.get_endpoint(self.data_schema, ep_schema, **kwargs)

            # Optionally delete existing data in the sink. This should be done
            # before SDCDataExchange.exchange_data() is run.
            # As of 12/9/2020, this functionality is used exclusively by
            # EndpointIntegrationTest.test_data_exchange()
            if 'delete_sink_data' in kwargs and kwargs['delete_sink_data']:
                delete_criteria = None

                # Execution varies by sink exchange type
                if sink.exchange_type in FileExchangeTypes.__members__:
                    delete_criteria = [FilenameMatchDeleteCriteria(file_name=sink.file_name)]
                    delete_msg = f"Attempting to delete '{tag}' files with the filename '{sink.file_name}' if any..."
                elif sink.exchange_type in DatabaseExchangeTypes.__members__:
                    delete_msg = f"Attempting to delete '{tag}' data..."
                else:
                    raise Exception(
                        f"The '{tag}' endpoint class does not support 'delete_sink_data' functionality at this time.")

                logging.info(delete_msg)

                if delete_criteria:
                    sink.delete_data(delete_criteria)
                else:
                    sink.delete_data()

            # If a logger is requested, set up things needed for the logger.
            if sink.logger:
                sink = self.__logger_setup(sink, ep_schema)
                self.source.logger = True

            # Some quick maintenance before we create new merge tables.
            if sink.exchange_type == DataExchangeTypes.snowflake.value:
                if sink.upsert:
                    sink.clean_old_temp_tables()

            self.sinks.append(sink)

            # TODO: Figure out better logic for the below.
            #  E.g. in the case of moving a file from S3/csv to DynamoDB, how do we apply that?
            # Determine which files need to be sent from source to sink.
            if SDCFileHelpers.is_file_endpoint_type(self.source_endpoint_schema["info"]["type"]):
                self.sink_items_to_process[sink.endpoint_tag] = \
                    None if self.source.endpoint_items["source"] is None \
                    else SDCDataExchange.compare_item_lists(self.source, sink)
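
    # Hypothetical call exercising the optional 'delete_sink_data' kwarg above (e.g. from
    # an integration test); any other kwargs are forwarded to the endpoint factory:
    #
    #   exchange = SDCDataExchange("Example-Namespace/example-name",
    #                              "main_source", "SDC_sink_0",
    #                              delete_sink_data=True)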

    def __logger_setup(self, sink_, data_schema_):
        """
        Sets up needed functionality for a sink logger.
        :param sink_: An SDCDataExchange sink endpoint object.
        :param data_schema_: Endpoint schema.
        :return: An SDCDataExchange sink endpoint object.
        """

        # If a logger is used for any sink, we need to set a UUID for the source.
        self.source.endpoint_uuid = sink_.logger.generate_endpoint_uuid(self.source_endpoint_schema, True)

        # Set up the base values for the logger insert message. DATA_ITEM_NAME and MESSAGE will be updated when used.
        sink_.logger_insert_message = {
            "SCHEMA_NAMESPACE": self.data_schema["namespace"],
            "SCHEMA_NAME": self.data_schema["name"],
            "DATA_SOURCE_TAG": self.source_endpoint_schema["tag"].upper(),
            "DATA_SOURCE_TYPE": self.source_endpoint_schema["info"]["type"].upper(),
            "DATA_SOURCE_ENDPOINT_UUID": self.source.endpoint_uuid,
            "DATA_SINK_TAG": sink_.endpoint_schema["tag"],
            "DATA_SINK_TYPE": sink_.endpoint_schema["info"]["type"].upper(),
            "DATA_SINK_ENDPOINT_UUID": sink_.endpoint_uuid,
            "DATA_ITEM_NAME": None,
            "EXCHANGE_MESSAGE": None
        }

        return sink_

    @staticmethod
    def compare_item_lists(source_, sink_):
        """
        Generates a list of data items that are in the source but not in the sink.
        :param source_: Endpoint object. An SDCDataExchange source endpoint object.
        :param sink_: Endpoint object. An SDCDataExchange sink endpoint object.
        :return: List. Data item names that need to be synced (e.g. file names, etc.)
        """

        if not isinstance(source_.endpoint_items["source"], list) or \
                not isinstance(sink_.endpoint_items["sink_successful"], list) or \
                not isinstance(sink_.endpoint_items["sink_processing"], list):
            raise Exception("Files for source and sink must be passed as Lists.")

        source_data = set(source_.endpoint_items["source"])
        sink_successful = set(sink_.endpoint_items["sink_successful"])
        sink_still_processing = set(sink_.endpoint_items["sink_processing"])

        if sink_still_processing:
            logging.info(f"These data items are considered to still be in process: {sink_still_processing}")

        # The items that need to be processed are the ones in the source but not in the sink,
        # less any items that the logger considers still in process.
        data_list = list(source_data.difference(sink_successful).difference(sink_still_processing))

        return data_list
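
    # A minimal worked example of the set arithmetic above (hypothetical file names):
    #   source:          {"a.csv", "b.csv", "c.csv"}
    #   sink_successful: {"a.csv"}
    #   sink_processing: {"b.csv"}
    #   -> returns ["c.csv"]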

    def _close_connections(self):
        """
        Closes the source and sink connections.
        :return: None
        """
        try:
            self.source.close_connection()
        except Exception:
            pass

        for sink in self.sinks:
            try:
                sink.close_connection()
            except Exception:
                pass

    def _get_items_by_sink(self, sink_, items_to_process_):
        """
        Returns the list of items to be processed by the sink.
        :param sink_: An SDCDataExchange sink endpoint object.
        :param items_to_process_: A List[str] with the names of the items to process.
        :return: List[str]
        """
        return list(item for item in filter(lambda tag: tag in self.sink_items_to_process[sink_.endpoint_tag],
                                            items_to_process_))
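
    # e.g. with sink_items_to_process[sink_.endpoint_tag] == {"a.csv", "c.csv"} and
    # items_to_process_ == ["a.csv", "b.csv", "c.csv"], this returns ["a.csv", "c.csv"]
    # (hypothetical names; order follows items_to_process_).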

    def _get_df_record_count(self, sdcDataframe):
        if isinstance(sdcDataframe, Dataframe):
            sdcDataframe.shape = sdcDataframe.df.shape
            return sdcDataframe.shape[0]
        else:
            return 0
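
    # Note: anything that is not an SDC Dataframe (e.g. a file object passed through
    # without being converted) counts as 0 records here.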

    def exchange_data(self):
        """
        Exchanges data between a source and one or more sinks.
        :return: Dict. Results log of information for each item's exchange attempt.
        """
        self.etl_results_log = {}

        def generate_data_item_uuid(data_source_endpoint_uuid_, data_sink_endpoint_uuid_, data_item_name_):
            item_uuid = sink.logger.generate_uuid({
                "DATA_SOURCE_ENDPOINT_UUID": data_source_endpoint_uuid_,
                "DATA_SINK_ENDPOINT_UUID": data_sink_endpoint_uuid_,
                "DATA_ITEM_NAME": data_item_name_
            })
            return item_uuid

        def log_new_data_item(item_to_process_):
            for sink in self.sinks:
                if item_to_process_ in self.sink_items_to_process[sink.endpoint_tag]:
                    if sink.logger:
                        sink.logger_insert_message["DATA_ITEM_NAME"] = item_to_process_
                        sink.logger.add_item_to_base_table({
                            "SCHEMA_NAMESPACE": self.data_schema["namespace"],
                            "SCHEMA_NAME": self.data_schema["name"],
                            "DATA_SOURCE_TAG": self.source_endpoint_schema["tag"].upper(),
                            "DATA_SOURCE_TYPE": self.source_endpoint_schema["info"]["type"].upper(),
                            "DATA_SOURCE_ENDPOINT_UUID": self.source.endpoint_uuid,
                            "DATA_SINK_TAG": sink.endpoint_schema["tag"],
                            "DATA_SINK_TYPE": sink.endpoint_schema["info"]["type"].upper(),
                            "DATA_SINK_ENDPOINT_UUID": sink.endpoint_uuid,
                            "DATA_ITEM_NAME": item_to_process_,
                            "EXCHANGE_MESSAGE": "Exchange started."
                        })

        def log_source_record_count(item_to_process_, source_record_count_):
            for sink in self.sinks:
                if item_to_process_ in self.sink_items_to_process[sink.endpoint_tag]:
                    if sink.logger:
                        item_uuid = generate_data_item_uuid(self.source.endpoint_uuid, sink.endpoint_uuid,
                                                            item_to_process_)
                        sink.logger.update_base_table({
                            "UUID": item_uuid,
                            "TOTAL_RECORDS_WRITTEN": 0,
                            "TOTAL_RECORDS_FROM_SOURCE": source_record_count_,
                            "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.PROCESSING.value,
                            "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SOURCING.value,
                            "EXCHANGE_MESSAGE": f"Source record count is {source_record_count_:,}."
                        })

        def log_empty_result(item_to_process_):
            msg_empty = f"{FileResultTypes.empty.value}: {item_to_process_} contains no data and was skipped."
            logging.error(f"{msg_empty}")
            for sink in self.sinks:
                if item_to_process_ in self.sink_items_to_process[sink.endpoint_tag]:
                    self.etl_results_log[sink.endpoint_tag].append(msg_empty)
                    if sink.logger:
                        item_uuid = generate_data_item_uuid(self.source.endpoint_uuid, sink.endpoint_uuid,
                                                            item_to_process_)
                        sink.logger.update_base_table({
                            "UUID": item_uuid,
                            "TOTAL_RECORDS_WRITTEN": 0,
                            "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.EMPTY.value,
                            "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.NONE.value,
                            "EXCHANGE_MESSAGE": msg_empty
                        })

        def log_dataframe_record_count(item_to_process_, df_record_count):
            for sink in self.sinks:
                if item_to_process_ in self.sink_items_to_process[sink.endpoint_tag]:
                    if sink.logger:
                        item_uuid = generate_data_item_uuid(self.source.endpoint_uuid, sink.endpoint_uuid,
                                                            item_to_process_)
                        sink.logger.update_base_table({
                            "UUID": item_uuid,
                            "TOTAL_RECORDS_WRITTEN": 0,
                            "TOTAL_RECORDS_FROM_DATAFRAME": df_record_count,
                            "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.PROCESSING.value,
                            "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SOURCING.value,
                            "EXCHANGE_MESSAGE": f"Dataframe record count is {df_record_count:,}."
                        })

        def log_failed_sourcing(item_to_process_, error_):
            msg = f"{FileResultTypes.error.value}: Sourcing {item_to_process_} from {self.source.exchange_type} failed."
            logging.warning(f"{msg}. {error_}")

            for sink in self.sinks:
                if item_to_process_ in self.sink_items_to_process[sink.endpoint_tag]:
                    self.etl_results_log[sink.endpoint_tag].append(msg)
                    if sink.logger:
                        item_uuid = generate_data_item_uuid(self.source.endpoint_uuid, sink.endpoint_uuid,
                                                            item_to_process_)
                        sink.logger.update_base_table({
                            "UUID": item_uuid,
                            "TOTAL_RECORDS_WRITTEN": 0,
                            "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.FAILED.value,
                            "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SOURCING.value,
                            "EXCHANGE_MESSAGE": msg
                        })

        def exchange_data_by_file_in_blocks(num_items_to_process_):
            """
            Exchanges data between a source and one or more sinks, by blocks.
            :param num_items_to_process_: Int. Total number of items to process.
            :return: Dict. Results log of information for each item's exchange attempt.
            """
            items_processed = []
            block_start_number = 1
            sdc_df_to_save = Dataframe(None, self.data_schema)

            def execute_sink_by_block(item_no_):
                item_name = "block"
                logging.info(
                    f"Processing {item_name}. {block_start_number} to {item_no_+1}/{num_items_to_process_} files "
                    f"with {sdc_df_to_save.shape[0]} rows")
                execute_file_based_sinks(sdc_df_to_save, items_processed, item_name)

            for item_no, item_to_process in enumerate(self.sink_items_to_process[self.source.endpoint_tag]):
                try:
                    source_data, source_record_count = self.source.get_data(item_to_process)
                    new_sdc_df = source_data.get_file_as_dataframe()

                    # TODO: Remove this line when the get_file_as_dataframe() method returns _ETL_FILENAME with the
                    #  name of the file that we are processing instead of returning a date.
                    new_sdc_df.df["_ETL_FILENAME"] = item_to_process

                except Exception as error:
                    log_failed_sourcing(item_to_process, error)
                    traceback.print_exc(file=sys.stdout)
                    continue

                sdc_df_to_save = SDCDataframeHelpers.concat_sdc_dataframe(self.data_schema, sdc_df_to_save, new_sdc_df)
                items_processed.append((item_to_process, source_record_count or 0))
                if sdc_df_to_save.shape[0] >= self.source_endpoint_schema["info"]["opts"]["rows_by_block"]:
                    execute_sink_by_block(item_no)
                    items_processed = []
                    block_start_number = item_no + 2
                    sdc_df_to_save = Dataframe(None, self.data_schema)

            if sdc_df_to_save.shape[0] > 0:
                execute_sink_by_block(item_no)
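
        # Worked example (hypothetical numbers): with rows_by_block = 50,000, source files are
        # concatenated until the running dataframe holds at least 50,000 rows, that block is
        # flushed to the sinks via execute_file_based_sinks(), and accumulation restarts with
        # the next file; any final partial block is flushed after the loop.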

        def exchange_data_by_file(num_items_to_process_):
            """
            Exchanges data between a source and one or more sinks, file by file.
            :param num_items_to_process_: Int. Total number of items to process.
            :return: Dict. Results log of information for each item's exchange attempt.
            """
            for item_no, item_to_process in enumerate(self.sink_items_to_process[self.source.endpoint_tag]):
                logging.info(f"Processing {item_to_process} (Item {item_no + 1}/{num_items_to_process_})...")

                log_new_data_item(item_to_process)
                source_data = None
                try:
                    source_data, source_record_count = self.source.get_data(item_to_process)
                    df_record_count = 0

                    if SDCFileFactory.is_type_sdcfile(source_data):
                        if source_data.compression_type != "gzip":
                            source_data = source_data.get_file_as_dataframe()
                            source_data.shape = source_data.df.shape
                            df_record_count = source_data.shape[0]
                            log_dataframe_record_count(item_to_process, df_record_count)

                            if df_record_count == 0:
                                log_empty_result(item_to_process)
                                continue
                    log_source_record_count(item_to_process, source_record_count)
                except Exception as error:
                    log_failed_sourcing(item_to_process, error)
                    traceback.print_exc(file=sys.stdout)
                    continue

                execute_file_based_sinks(source_data, [(item_to_process, source_record_count or 0)], item_to_process)

        def execute_file_based_sinks(source_data_, items_to_process_, item_name_):
            for sink in self.sinks:
                items_to_process_by_sink = self._get_items_by_sink(sink, list(i[0] for i in items_to_process_))
                source_record_count_by_sink = sum(i[1] for i in filter(
                    lambda tag: tag[0] in self.sink_items_to_process[sink.endpoint_tag], items_to_process_))

                if len(items_to_process_by_sink) > 0:

                    if self.process_by_block:
                        source_data = Dataframe(None, self.data_schema)
                        df_filtered = source_data_.df.loc[source_data_.df['_ETL_FILENAME'].isin(
                            self.sink_items_to_process[sink.endpoint_tag])]
                        source_data.df = df_filtered
                    else:
                        source_data = source_data_

                    df_record_count_by_sink = self._get_df_record_count(source_data)

                    item_uuid = None
                    force_merge = False
                    if sink.logger:
                        # Set some columns to increment on update when doing iterative loads so numbers don't get overwritten
                        if sink.logger.processing_type == "by_records":
                            sink.logger.make_incremental_cols = [
                                "TOTAL_RECORDS_FROM_DATAFRAME", "TOTAL_RECORDS_TO_WRITE"
                            ]
                        item_uuid = generate_data_item_uuid(self.source.endpoint_uuid, sink.endpoint_uuid,
                                                            item_name_)
                        sink.logger.write_stats = {}

                    # Process for handling failed files
                    if sink.logger and sink.exchange_type in DatabaseExchangeTypes.__members__:
                        if item_name_ in sink.endpoint_items["sink_failed"]:
                            logging.info(
                                f'{item_name_} had an existing FAILED status for {sink.endpoint_schema["tag"]}. '
                                f'Checking if any data made it over...')
                            records_loaded = sink.get_records_in_table_by_data_item(item_name_)
                            update_type = LoggerEnums.SnowflakeWritesOperations.UPSERT.value \
                                if sink.endpoint_schema["info"]["opts"]["upsert"] else \
                                LoggerEnums.SnowflakeWritesOperations.INSERT.value
                            if records_loaded == 0:
                                logging.info(f"No data appeared to have made it into the sink. "
                                             f"Re-processing data via {update_type} method...")
                            else:
                                logging.info(
                                    f"{records_loaded:,} record(s) exist for this data item in the sink.")
                                # Check for ignore_force_merge. This will speed up processing by not creating temp/dedupe tables
                                if sink.logger.ignore_force_merge:
                                    logging.info(
                                        "Ignoring force merge option. Records will continue to be inserted."
                                    )
                                elif update_type == LoggerEnums.SnowflakeWritesOperations.UPSERT.value:
                                    logging.info(
                                        f"To be safe, re-processing data via the original selected "
                                        f"{update_type} method.")
                                # If force_merge is used, data that is normally inserted will first be
                                # put into a temp table, deduped, and then merged into main table.
                                elif update_type == LoggerEnums.SnowflakeWritesOperations.INSERT.value:
                                    logging.info(f"To be safe, re-processing data via FORCED MERGE method.")
                                    force_merge = True

                    try:
                        result = None
                        if sink.logger and sink.exchange_type in FileExchangeTypes.__members__:
                            sink.logger.update_base_table({
                                "UUID": item_uuid,
                                "TOTAL_RECORDS_WRITTEN": 0,
                                "TOTAL_RECORDS_TO_WRITE": df_record_count_by_sink,
                                "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.PROCESSING.value,
                                "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SYNCING.value,
                                "EXCHANGE_MESSAGE": f"Total records to write is {df_record_count_by_sink:,}."
                            })

                        # Process for iterative processing runs
                        if sink.exchange_type in DatabaseExchangeTypes.__members__ \
                                and sink.logger:
                            if self.dataframe_type == SDCDFTypes.PANDAS:
                                if sink.logger.processing_type == "by_records":
                                    chunk_size = sink.logger.chunk_size
                                    total_runs = math.ceil(source_record_count_by_sink / chunk_size)

                                    # If item previously failed, set the current_record and current_run based on logging stats
                                    if item_name_ in sink.endpoint_items["sink_failed"]:
                                        current_record = sink.endpoint_items["sink_failed"][item_name_][
                                            "TOTAL_RECORDS_WRITTEN"]
                                        current_run = math.ceil(current_record / chunk_size) + 1
                                        logging.info(
                                            f"Attempting to pick-up processing from PROCESSING RUN {current_run:,}/{total_runs:,}. "
                                            f"{current_record:,}/{source_record_count_by_sink:,} record(s) already written."
                                        )
                                    # If file has never been processed, we start from record and run #1
                                    else:
                                        current_record = 1
                                        current_run = 1
                                        logging.info(
                                            f"Attempting to process {source_record_count_by_sink:,} record(s) at {chunk_size} record(s) each. "
                                            f"Total runs needed: {total_runs:,}")

                                    # Iterative processes require a generator that outputs a dataframe.
                                    # Minus 1 from current_record for zero-indexed dataframes.
                                    source_data_generator = source_data.get_file_as_dataframe(
                                        start_row_=current_record - 1, chunksize_=chunk_size)

                                    while current_record < source_record_count_by_sink:
                                        if (current_record + chunk_size) > source_record_count_by_sink:
                                            chunk_size = source_record_count_by_sink - current_record
                                        logging.info(f"Starting PROCESSING RUN {current_run:,}/{total_runs:,}: "
                                                     f"Attempting to write {chunk_size:,} record(s)...")
                                        try:
                                            result = sink.write_data(
                                                source_data_generator,
                                                item_name_,
                                                force_merge_=force_merge,
                                                item_uuid_=item_uuid)

                                            current_record += chunk_size
                                            current_run += 1

                                        except Exception as e:
                                            msg = f"{FileResultTypes.error.value}: Syncing {item_name_} to {sink.exchange_type} failed."
                                            logging.error(f"{msg}. {e}")
                                            self.etl_results_log[sink.endpoint_tag].append(msg)
                                            if sink.logger:
                                                sink.logger.update_base_table({
                                                    "UUID": item_uuid,
                                                    "TOTAL_RECORDS_WRITTEN": 0,
                                                    "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.FAILED.value,
                                                    "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SYNCING.value,
                                                    "EXCHANGE_MESSAGE": msg,
                                                    "WRITE_STATS": sink.logger.write_stats
                                                })
                                            traceback.print_exc(file=sys.stdout)
                                            continue

                                        msg = f"Completed PROCESSING RUN {current_run - 1}/{total_runs}: {chunk_size:,} record(s) written."
                                        logging.info(msg)
                                        if sink.logger:
                                            sink.logger.update_base_table({
                                                "UUID": item_uuid,
                                                "TOTAL_RECORDS_WRITTEN": chunk_size,
                                                "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.PROCESSING.value,
                                                "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SYNCING.value,
                                                "EXCHANGE_MESSAGE": msg,
                                                "WRITE_STATS": sink.logger.write_stats
                                            })

                                    # TODO: For incremental runs, result is set to '' as the total would be for the last
                                    #  batch of inserts/upserts, not the entire ETL job. Find a better way to show this
                                    #  in messages/logs.
                                    result = ''
                                else:
                                    result = sink.write_data(
                                        source_data,
                                        item_name_,
                                        force_merge_=force_merge,
                                        item_uuid_=item_uuid)
                            else:
                                raise Exception(
                                    f'Dataframe type of "{self.dataframe_type}" not valid for iterative processing of dataframes.'
                                )

                        elif sink.exchange_type in DatabaseExchangeTypes.__members__:
                            result = sink.write_data(
                                source_data, item_name_, force_merge_=force_merge, item_uuid_=item_uuid)
                        elif sink.exchange_type in NoSqlDatabaseExchangeTypes.__members__:
                            result = sink.write_data(source_data)
                        else:
                            result = sink.write_data(source_data, item_name_)

                        msg = f"{FileResultTypes.success.value}: Loaded {item_name_} to {sink.exchange_type}. {result}"
                        logging.info(msg)
                        self.etl_results_log[sink.endpoint_tag].append(msg)
                        if sink.logger:
                            if sink.logger.processing_type == "by_records":
                                # When processing iteratively, we nix the result numbers and the record_count as those
                                # would represent the last chunk size and not the total amount.
                                msg = f"{FileResultTypes.success.value}: Loaded {item_name_} to {sink.exchange_type}."
                                record_count = 0
                            elif "STATS" in sink.logger.write_stats:
                                record_count = [
                                    i["TOTAL_RECORDS"]
                                    for i in sink.logger.write_stats["STATS"]
                                    if i["DEST_TYPE"] == "MAIN"
                                ][0]
                            sink.logger.update_base_table({
                                "UUID": item_uuid,
                                "TOTAL_RECORDS_WRITTEN": record_count,
                                "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.FINISHED.value,
                                "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.NONE.value,
                                "EXCHANGE_MESSAGE": msg,
                                "WRITE_STATS": sink.logger.write_stats
                            })

                    except DataframeForDBEmpty:
                        log_empty_result(item_name_)
                        continue

                    except Exception as e:
                        msg = f"{FileResultTypes.error.value}: Syncing {item_name_} to {sink.exchange_type} failed."
                        logging.error(f"{msg}. {e}")
                        self.etl_results_log[sink.endpoint_tag].append(msg)
                        if sink.logger:
                            sink.logger.update_base_table({
                                "UUID": item_uuid,
                                "TOTAL_RECORDS_WRITTEN": 0,
                                "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.FAILED.value,
                                "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SYNCING.value,
                                "EXCHANGE_MESSAGE": msg,
                                "WRITE_STATS": sink.logger.write_stats
                            })
                        traceback.print_exc(file=sys.stdout)
                        continue
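
        # Dispatch on the source endpoint type: mock, file-based, database/NoSQL, or API.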

        if self.source_endpoint_schema["info"]["type"] in MockExchangeTypes.__members__:
            sdc_df = self.source.get_data()

            for sink in self.sinks:
                try:
                    self.etl_results_log[sink.endpoint_tag] = []
                    result = sink.write_data(data_=sdc_df, file_name_="MOCKED_DATASET")
                    msg = f"{ExchangeResultTypes.SUCCESS.value}: Loaded mock data " \
                          f"to {sink.exchange_type}. {result}"
                    logging.info(msg)
                    self.etl_results_log[sink.endpoint_tag].append(msg)
                except Exception as e:
                    msg = f"{ExchangeResultTypes.ERROR.value}: Syncing mock data to " \
                          f"{sink.exchange_type} failed."
                    logging.error(f"{msg}. {e}")
                    self.etl_results_log[sink.endpoint_tag].append(msg)

        elif self.source_endpoint_schema["info"]["type"] in FileExchangeTypes.__members__:

            # Log message if no items exist to process for a sink.
            for sink in self.sinks:
                self.etl_results_log[sink.endpoint_tag] = []
                if len(self.sink_items_to_process[sink.endpoint_tag]) == 0:
                    msg = f"There were no new items to send to the {sink.exchange_type} sink."
                    logging.info(msg)
                    self.etl_results_log[sink.endpoint_tag].extend([msg])

            num_items_to_process = len(self.sink_items_to_process[self.source.endpoint_tag])
            logging.info(f"There are {num_items_to_process} item(s) to process.")

            if self.process_by_block:
                exchange_data_by_file_in_blocks(num_items_to_process)
            else:
                exchange_data_by_file(num_items_to_process)

            return self.etl_results_log

        elif self.source_endpoint_schema["info"]["type"] in DatabaseExchangeTypes.__members__ \
                or self.source_endpoint_schema["info"]["type"] in NoSqlDatabaseExchangeTypes.__members__:

            sdc_df = self.source.get_data()

            for sink in self.sinks:
                self.etl_results_log[sink.endpoint_tag] = []
                try:
                    if sink.exchange_type in FileExchangeTypes.__members__:
                        item_name = sink.file_name
                        result = sink.write_data(sdc_df, item_name)
                    else:
                        item_name = "data"
                        result = sink.write_data(sdc_df)

                    msg = f"{FileResultTypes.success.value}: Loaded " \
                          f"{item_name} to {sink.exchange_type}. {result}"
                    logging.info(msg)
                    self.etl_results_log[sink.endpoint_tag].append(msg)

                except DataframeForDBEmpty:
                    log_empty_result(None)
                    continue

                except Exception as e:
                    traceback.print_exc(file=sys.stdout)
                    msg = f"{FileResultTypes.error.value}: " \
                          f"Syncing {item_name} to " \
                          f"{sink.exchange_type} failed."
                    logging.error(f"{msg}. {e}")
                    self.etl_results_log[sink.endpoint_tag].append(msg)
                    continue
            return self.etl_results_log

        elif self.source_endpoint_schema['info']["type"] in APIExchangeTypes.__members__:

            sourced = False
            try:
                sdc_df = self.source.get_data()
                sourced = True
            except Exception as e:
                sourced = False
                msg_error = f"{APIResultTypes.error.value}: Sourcing from {self.source.exchange_type} failed."
                logging.warning(f"{msg_error}. {e}")
                for sink in self.sinks:
                    self.etl_results_log[sink.endpoint_tag] = [msg_error]
                    if sink.logger:
                        logging.warning("Logger currently not setup for APIEndpoints. Results not logged.")
                raise Exception(f"{msg_error}. {e}")

            if sourced:
                if sdc_df is None:
                    # TODO: Implement logic in https://github.com/CamelotVG/data-engineering/issues/277 to check whether
                    #  no data should be considered OK (just log) or should be returned as an error. Defaulting to log
                    #  right now.
                    msg_nodata = f"{APIResultTypes.nodata.value}: No data was returned from the API call."
                    logging.info(f"{msg_nodata}")
                    for sink in self.sinks:
                        self.etl_results_log[sink.endpoint_tag] = [msg_nodata]
                        if sink.logger:
                            logging.warning("Logger currently not setup for APIEndpoints. Results not logged.")

                else:
                    for sink in self.sinks:
                        self.etl_results_log[sink.endpoint_tag] = []
                        api_name = self.source_endpoint_schema['info']['access']['api_name']

                        try:
                            if sink.exchange_type in FileExchangeTypes.__members__:
                                item_name = sink.file_name
                                result = sink.write_data(sdc_df, item_name)
                            else:
                                item_name = "data"
                                result = sink.write_data(sdc_df)

                            msg = f"{APIResultTypes.success.value}: Loaded data from API " \
                                  f"{api_name} to {sink.exchange_type}. {result}"
                            logging.info(msg)
                            self.etl_results_log[sink.endpoint_tag].append(msg)
                        except Exception as e:
                            msg = f"{APIResultTypes.error.value}: Syncing {api_name} to " \
                                  f"{sink.exchange_type} failed."
                            logging.error(f"{msg}. {e}")
                            self.etl_results_log[sink.endpoint_tag].append(msg)
                            continue
            return self.etl_results_log

        # Try to close source and sink connections, if applicable to the type.
        self._close_connections()