import logging
import math
import os
import sys
import traceback
import uuid

from sdc_etl_libs.sdc_data_exchange.SDCDataExchangeEndpointFactory import \
    SDCDataExchangeEndpointFactory
from sdc_etl_libs.sdc_data_exchange.SDCDataExchangeEnums import (
    APIExchangeTypes, APIResultTypes, DatabaseExchangeTypes, DataExchangeTypes,
    FileExchangeTypes, FileResultTypes, NoSqlDatabaseExchangeTypes, MockExchangeTypes, ExchangeResultTypes)
from sdc_etl_libs.sdc_data_exchange_loggers.SDCSnowflakeLogger import \
    SDCSnowflakeLoggerEnums as LoggerEnums
from sdc_etl_libs.sdc_data_schema.schema_toolbox import SchemaToolbox
from sdc_etl_libs.sdc_data_schema.schema_validation import SchemaValidation
from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe
from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes
from sdc_etl_libs.sdc_dataframe.sdc_dataframe_helpers import SDCDataframeHelpers
from sdc_etl_libs.sdc_exchange_endpoints.database_endpoints.SDCSnowflakeEndpoint import DataframeForDBEmpty
from sdc_etl_libs.sdc_file_helpers.SDCFileFactory import SDCFileFactory
from sdc_etl_libs.sdc_file_helpers.SDCFileHelpers import SDCFileHelpers
from sdc_etl_libs.sdc_filetransfer.utils.delete_files_mechanism.FilenameMatchDeleteCriteria import \
    FilenameMatchDeleteCriteria

class SDCDataExchange:

    # TODO: This will have to be updated to account for the new schema key layout

    def __init__(self, schema_name_, source_endpoint_tag_, sink_endpoint_tag_, **kwargs):
        """
        SDCDataExchange constructor. Creates source and sink instances and
        compares data item lists between endpoints (if applicable to the types).
        :param schema_name_: Name of data schema.
        :param source_endpoint_tag_: Tag of source endpoint (from data schema).
        :param sink_endpoint_tag_: Tag of sink endpoint (from data schema).
        """

        self.source_endpoint_schema = None
        self.source_endpoint_uuid = None
        self.source = None
        self.sinks = []
        self.data_schema = None

        self.source_endpoint_tag = source_endpoint_tag_
        self.sink_endpoint_tag = sink_endpoint_tag_
        self.etl_results_log = {}
        self.sink_items_to_process = {}

        # Set up the source and grab source information.
        self.get_data_schema(schema_name_)

        schema_validation = SchemaValidation()
        self.data_schema = schema_validation.validate_schema(self.data_schema)
        self.source_endpoint_schema = SchemaToolbox.get_endpoint_data_from_schema(self.data_schema,
                                                                                  self.source_endpoint_tag)

        self.dataframe_type = SchemaToolbox.determine_dataframe_type_from_schema(self.data_schema)

        self.source = SDCDataExchangeEndpointFactory.get_endpoint(self.data_schema,
                                                                  self.source_endpoint_schema,
                                                                  **kwargs)
        self.sink_setup(**kwargs)

        self.process_by_block = False

        if SDCFileHelpers.is_file_endpoint_type(self.source_endpoint_schema["info"]["type"]) and \
                self.source.endpoint_items["source"] is not None:
            distinct_sink_items = set()
            for sink in self.sinks:
                distinct_sink_items.update(self.sink_items_to_process[sink.endpoint_tag])
            self.sink_items_to_process[self.source.endpoint_tag] = distinct_sink_items
            self.process_by_block = self.source_endpoint_schema["info"]["opts"]["process_by_block"]
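
    # Example (a hypothetical usage sketch; the schema name and endpoint tags below are
    # placeholders for values defined in a registered data schema):
    #
    #   exchange = SDCDataExchange("SomeNamespace/some_dataset", "main_source", "SDC_snowflake")
    #   results = exchange.exchange_data()
    #   for sink_tag, messages in results.items():
    #       logging.info(f"{sink_tag}: {messages}")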

    def get_data_schema(self, schema_name_):
        """
        Retrieves the SDC data schema for the given schema name and stores it in self.data_schema.
        :param schema_name_: String. Name of data schema.
        :return: None.
        """

        schema_registry_api_url = os.getenv('schema_registry_api_url')
        namespace = schema_name_.split('/')[0]
        name = schema_name_.split('/')[1]
        self.data_schema = SchemaToolbox.get_schema(
            schema_registry_api_url_=schema_registry_api_url,
            namespace_=namespace,
            name_=name,
            local_schema_path_=schema_name_)
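
    # For example, a (hypothetical) schema name of "ServiceNowData/incidents" resolves to the
    # namespace "ServiceNowData" and the name "incidents"; the full "namespace/name" string is
    # also passed along as the local schema path fallback.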

    def sink_setup(self, **kwargs):
        """
        Sets up the sinks from the list of sink tags passed in, including:
        - Validating the sink schema.
        - Setting each sink as its endpoint class.
        - Determining which files need to be synced from the source.
        :return: None.
        """

        # One or more sinks can be passed. Ensure sink_endpoint_tag is a list.
        if isinstance(self.sink_endpoint_tag, str):
            self.sink_endpoint_tag = [self.sink_endpoint_tag]

        for tag in self.sink_endpoint_tag:

            ep_schema = SchemaToolbox.get_endpoint_data_from_schema(self.data_schema, tag)

            # Get the endpoint class for the sink.
            sink = SDCDataExchangeEndpointFactory.get_endpoint(self.data_schema, ep_schema, **kwargs)

            # Optionally delete existing data in the sink. This should be done
            # before SDCDataExchange.exchange_data() is run.
            # As of 12/9/2020, this functionality is used exclusively by
            # EndpointIntegrationTest.test_data_exchange().
            if 'delete_sink_data' in kwargs and kwargs['delete_sink_data']:
                delete_criteria = None

                # Execution varies by sink exchange type.
                if sink.exchange_type in FileExchangeTypes.__members__:
                    delete_criteria = [FilenameMatchDeleteCriteria(file_name=sink.file_name)]
                    delete_msg = f"Attempting to delete '{tag}' files with the filename '{sink.file_name}' if any..."
                elif sink.exchange_type in DatabaseExchangeTypes.__members__:
                    delete_msg = f"Attempting to delete '{tag}' data..."
                else:
                    raise Exception(
                        f"The '{tag}' endpoint class does not support 'delete_sink_data' functionality at this time.")

                logging.info(delete_msg)

                if delete_criteria:
                    sink.delete_data(delete_criteria)
                else:
                    sink.delete_data()

            # If a logger is requested, set up the things needed for the logger.
            if sink.logger:
                sink = self.__logger_setup(sink, ep_schema)
                self.source.logger = True

            # Some quick maintenance before we create new merge tables.
            if sink.exchange_type == DataExchangeTypes.snowflake.value:
                if sink.upsert:
                    sink.clean_old_temp_tables()

            self.sinks.append(sink)

            # TODO: Figure out better logic for the below.
            #  E.g. in the case of moving a file from S3/csv to DynamoDB, how should that apply?
            # Determine which files need to be sent from source to sink.
            if SDCFileHelpers.is_file_endpoint_type(self.source_endpoint_schema["info"]["type"]):
                self.sink_items_to_process[sink.endpoint_tag] = \
                    None if self.source.endpoint_items["source"] is None \
                    else SDCDataExchange.compare_item_lists(self.source, sink)
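
    # A single sink tag or a list of tags may be passed when constructing the exchange,
    # e.g. (hypothetical tags):
    #
    #   SDCDataExchange(schema_name, "main_source", "SDC_snowflake")
    #   SDCDataExchange(schema_name, "main_source", ["SDC_snowflake", "SDC_s3_archive"])
    #
    # Passing delete_sink_data=True as a keyword argument clears existing sink data before
    # the exchange runs (currently supported for file and database sinks only).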

    def __logger_setup(self, sink_, data_schema_):
        """
        Sets up the needed functionality for a sink logger.
        :param sink_: An SDCDataExchange sink endpoint object.
        :param data_schema_: Endpoint schema.
        :return: An SDCDataExchange sink endpoint object.
        """

        # If a logger is used for any sink, we need to set a UUID for the source.
        self.source.endpoint_uuid = sink_.logger.generate_endpoint_uuid(self.source_endpoint_schema, True)

        # Set up the base values for the logger insert message. DATA_ITEM_NAME and
        # EXCHANGE_MESSAGE will be updated when used.
        sink_.logger_insert_message = {
            "SCHEMA_NAMESPACE": self.data_schema["namespace"],
            "SCHEMA_NAME": self.data_schema["name"],
            "DATA_SOURCE_TAG": self.source_endpoint_schema["tag"].upper(),
            "DATA_SOURCE_TYPE": self.source_endpoint_schema["info"]["type"].upper(),
            "DATA_SOURCE_ENDPOINT_UUID": self.source.endpoint_uuid,
            "DATA_SINK_TAG": sink_.endpoint_schema["tag"],
            "DATA_SINK_TYPE": sink_.endpoint_schema["info"]["type"].upper(),
            "DATA_SINK_ENDPOINT_UUID": sink_.endpoint_uuid,
            "DATA_ITEM_NAME": None,
            "EXCHANGE_MESSAGE": None
        }

        return sink_

    @staticmethod
    def compare_item_lists(source_, sink_):
        """
        Generates a list of data items that are in the source but not in the sink.
        :param source_: Endpoint object. An SDCDataExchange source endpoint object.
        :param sink_: Endpoint object. An SDCDataExchange sink endpoint object.
        :return: List. Data item names that need to be synced (e.g. file names, etc.)
        """

        if not isinstance(source_.endpoint_items["source"], list) or \
                not isinstance(sink_.endpoint_items["sink_successful"], list) or \
                not isinstance(sink_.endpoint_items["sink_processing"], list):
            raise Exception("Files for source and sink must be passed as Lists.")

        source_data = set(source_.endpoint_items["source"])
        sink_successful = set(sink_.endpoint_items["sink_successful"])
        sink_still_processing = set(sink_.endpoint_items["sink_processing"])

        if sink_still_processing:
            logging.info(f"These data items are considered to still be in process: {sink_still_processing}")

        # The items that need to be processed are the ones in the source that are not in the
        # sink, less any items that the logger considers still in process.
        data_list = list(source_data.difference(sink_successful).difference(sink_still_processing))

        return data_list
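
    # For example (hypothetical file names): with source items ["a.csv", "b.csv", "c.csv"],
    # sink_successful ["a.csv"] and sink_processing ["b.csv"], compare_item_lists() returns
    # ["c.csv"], the only item not yet written and not currently being processed.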

    def _close_connections(self):
        """
        Closes the source and all sink connections.
        :return: None.
        """
        try:
            self.source.close_connection()
        except Exception:
            pass

        for sink in self.sinks:
            try:
                sink.close_connection()
            except Exception:
                pass

    def _get_items_by_sink(self, sink_, items_to_process_):
        """
        Returns the list of items to be processed by the given sink.
        :param sink_: An SDCDataExchange sink endpoint object.
        :param items_to_process_: List[str] of item names to process.
        :return: List[str].
        """
        return [item for item in items_to_process_
                if item in self.sink_items_to_process[sink_.endpoint_tag]]
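
    # For example (hypothetical): if the sink's pending item list is ["a.csv", "c.csv"] and
    # items_to_process_ is ["a.csv", "b.csv", "c.csv"], this returns ["a.csv", "c.csv"].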

    def _get_df_record_count(self, sdc_dataframe_):
        """
        Returns the number of records in an SDC Dataframe (0 if not a Dataframe).
        :param sdc_dataframe_: An SDC Dataframe object.
        :return: Int. Record count.
        """
        if isinstance(sdc_dataframe_, Dataframe):
            sdc_dataframe_.shape = sdc_dataframe_.df.shape
            return sdc_dataframe_.shape[0]
        else:
            return 0

    def exchange_data(self):
        """
        Exchanges data between a source and one or more sinks.
        :return: Dict. Results log of information for each item's exchange attempt.
        """
        self.etl_results_log = {}

        def generate_data_item_uuid(data_source_endpoint_uuid_, data_sink_endpoint_uuid_, data_item_name_):
            item_uuid = sink.logger.generate_uuid({
                "DATA_SOURCE_ENDPOINT_UUID": data_source_endpoint_uuid_,
                "DATA_SINK_ENDPOINT_UUID": data_sink_endpoint_uuid_,
                "DATA_ITEM_NAME": data_item_name_
            })
            return item_uuid

        def log_new_data_item(item_to_process_):
            for sink in self.sinks:
                if item_to_process_ in self.sink_items_to_process[sink.endpoint_tag]:
                    if sink.logger:
                        sink.logger_insert_message["DATA_ITEM_NAME"] = item_to_process_
                        sink.logger.add_item_to_base_table({
                            "SCHEMA_NAMESPACE": self.data_schema["namespace"],
                            "SCHEMA_NAME": self.data_schema["name"],
                            "DATA_SOURCE_TAG": self.source_endpoint_schema["tag"].upper(),
                            "DATA_SOURCE_TYPE": self.source_endpoint_schema["info"]["type"].upper(),
                            "DATA_SOURCE_ENDPOINT_UUID": self.source.endpoint_uuid,
                            "DATA_SINK_TAG": sink.endpoint_schema["tag"],
                            "DATA_SINK_TYPE": sink.endpoint_schema["info"]["type"].upper(),
                            "DATA_SINK_ENDPOINT_UUID": sink.endpoint_uuid,
                            "DATA_ITEM_NAME": item_to_process_,
                            "EXCHANGE_MESSAGE": "Exchange started."
                        })

        def log_source_record_count(item_to_process_, source_record_count_):
            for sink in self.sinks:
                if item_to_process_ in self.sink_items_to_process[sink.endpoint_tag]:
                    if sink.logger:
                        item_uuid = generate_data_item_uuid(self.source.endpoint_uuid, sink.endpoint_uuid,
                                                            item_to_process_)
                        sink.logger.update_base_table({
                            "UUID": item_uuid,
                            "TOTAL_RECORDS_WRITTEN": 0,
                            "TOTAL_RECORDS_FROM_SOURCE": source_record_count_,
                            "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.PROCESSING.value,
                            "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SOURCING.value,
                            "EXCHANGE_MESSAGE": f"Source record count is {source_record_count_:,}."
                        })

        def log_empty_result(item_to_process_):
            msg_empty = f"{FileResultTypes.empty.value}: {item_to_process_} contains no data and was skipped."
            logging.error(msg_empty)
            for sink in self.sinks:
                if item_to_process_ in self.sink_items_to_process[sink.endpoint_tag]:
                    self.etl_results_log[sink.endpoint_tag].append(msg_empty)
                    if sink.logger:
                        item_uuid = generate_data_item_uuid(self.source.endpoint_uuid, sink.endpoint_uuid,
                                                            item_to_process_)
                        sink.logger.update_base_table({
                            "UUID": item_uuid,
                            "TOTAL_RECORDS_WRITTEN": 0,
                            "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.EMPTY.value,
                            "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.NONE.value,
                            "EXCHANGE_MESSAGE": msg_empty
                        })

        def log_dataframe_record_count(item_to_process_, df_record_count):
            for sink in self.sinks:
                if item_to_process_ in self.sink_items_to_process[sink.endpoint_tag]:
                    if sink.logger:
                        item_uuid = generate_data_item_uuid(self.source.endpoint_uuid, sink.endpoint_uuid,
                                                            item_to_process_)
                        sink.logger.update_base_table({
                            "UUID": item_uuid,
                            "TOTAL_RECORDS_WRITTEN": 0,
                            "TOTAL_RECORDS_FROM_DATAFRAME": df_record_count,
                            "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.PROCESSING.value,
                            "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SOURCING.value,
                            "EXCHANGE_MESSAGE": f"Dataframe record count is {df_record_count:,}."
                        })

        def log_failed_sourcing(item_to_process_, error_):
            msg = f"{FileResultTypes.error.value}: Sourcing {item_to_process_} from {self.source.exchange_type} failed."
            logging.warning(f"{msg}. {error_}")

            for sink in self.sinks:
                if item_to_process_ in self.sink_items_to_process[sink.endpoint_tag]:
                    self.etl_results_log[sink.endpoint_tag].append(msg)
                    if sink.logger:
                        item_uuid = generate_data_item_uuid(self.source.endpoint_uuid, sink.endpoint_uuid,
                                                            item_to_process_)
                        sink.logger.update_base_table({
                            "UUID": item_uuid,
                            "TOTAL_RECORDS_WRITTEN": 0,
                            "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.FAILED.value,
                            "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SOURCING.value,
                            "EXCHANGE_MESSAGE": msg
                        })

        def exchange_data_by_file_in_blocks(num_items_to_process_):
            """
            Exchanges data between a source and one or more sinks, in blocks of files.
            :param num_items_to_process_: Int. Total number of items to process.
            :return: None. Results are appended to self.etl_results_log.
            """
            items_processed = []
            block_start_number = 1
            sdc_df_to_save = Dataframe(None, self.data_schema)

            def execute_sink_by_block(item_no_):
                item_name = "block"
                logging.info(
                    f"Processing {item_name}. {block_start_number} to {item_no_ + 1}/{num_items_to_process_} files "
                    f"with {sdc_df_to_save.shape[0]} rows")
                execute_file_based_sinks(sdc_df_to_save, items_processed, item_name)

            for item_no, item_to_process in enumerate(self.sink_items_to_process[self.source.endpoint_tag]):
                try:
                    source_data, source_record_count = self.source.get_data(item_to_process)
                    new_sdc_df = source_data.get_file_as_dataframe()

                    # TODO: Remove this line when the get_file_as_dataframe() method returns _ETL_FILENAME with
                    #  the name of the file that we are processing instead of returning a date.
                    new_sdc_df.df["_ETL_FILENAME"] = item_to_process

                except Exception as error:
                    log_failed_sourcing(item_to_process, error)
                    traceback.print_exc(file=sys.stdout)
                    continue

                sdc_df_to_save = SDCDataframeHelpers.concat_sdc_dataframe(self.data_schema, sdc_df_to_save, new_sdc_df)
                items_processed.append((item_to_process, source_record_count or 0))
                if sdc_df_to_save.shape[0] >= self.source_endpoint_schema["info"]["opts"]["rows_by_block"]:
                    execute_sink_by_block(item_no)
                    items_processed = []
                    block_start_number = item_no + 2
                    sdc_df_to_save = Dataframe(None, self.data_schema)

            # Flush any remaining rows that did not fill a complete block.
            if sdc_df_to_save.shape[0] > 0:
                execute_sink_by_block(item_no)
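
        # For example (hypothetical numbers): with "process_by_block": true and
        # "rows_by_block": 100000 in the source endpoint schema opts, files are accumulated
        # into a single dataframe and flushed to the sinks each time the accumulated row
        # count reaches 100,000, plus one final flush for any remainder.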

        def exchange_data_by_file(num_items_to_process_):
            """
            Exchanges data between a source and one or more sinks, file by file.
            :param num_items_to_process_: Int. Total number of items to process.
            :return: None. Results are appended to self.etl_results_log.
            """
            for item_no, item_to_process in enumerate(self.sink_items_to_process[self.source.endpoint_tag]):
                logging.info(f"Processing {item_to_process} (Item {item_no + 1}/{num_items_to_process_})...")

                log_new_data_item(item_to_process)
                source_data = None
                try:
                    source_data, source_record_count = self.source.get_data(item_to_process)
                    df_record_count = 0

                    if SDCFileFactory.is_type_sdcfile(source_data):
                        if source_data.compression_type != "gzip":
                            source_data = source_data.get_file_as_dataframe()
                            source_data.shape = source_data.df.shape
                            df_record_count = source_data.shape[0]
                            log_dataframe_record_count(item_to_process, df_record_count)

                            if df_record_count == 0:
                                log_empty_result(item_to_process)
                                continue
                    log_source_record_count(item_to_process, source_record_count)
                except Exception as error:
                    log_failed_sourcing(item_to_process, error)
                    traceback.print_exc(file=sys.stdout)
                    continue

                execute_file_based_sinks(source_data, [(item_to_process, source_record_count or 0)], item_to_process)

        def execute_file_based_sinks(source_data_, items_to_process_, item_name_):
            for sink in self.sinks:
                items_to_process_by_sink = self._get_items_by_sink(sink, list(i[0] for i in items_to_process_))
                source_record_count_by_sink = sum(i[1] for i in filter(
                    lambda tag: tag[0] in self.sink_items_to_process[sink.endpoint_tag], items_to_process_))

                if len(items_to_process_by_sink) > 0:

                    if self.process_by_block:
                        source_data = Dataframe(None, self.data_schema)
                        df_filtered = source_data_.df.loc[source_data_.df['_ETL_FILENAME'].isin(
                            self.sink_items_to_process[sink.endpoint_tag])]
                        source_data.df = df_filtered
                    else:
                        source_data = source_data_

                    df_record_count_by_sink = self._get_df_record_count(source_data)

                    item_uuid = None
                    force_merge = False
                    if sink.logger:
                        # Set some columns to increment on update when doing iterative loads
                        # so the numbers don't get overwritten.
                        if sink.logger.processing_type == "by_records":
                            sink.logger.make_incremental_cols = [
                                "TOTAL_RECORDS_FROM_DATAFRAME", "TOTAL_RECORDS_TO_WRITE"
                            ]
                        item_uuid = generate_data_item_uuid(self.source.endpoint_uuid, sink.endpoint_uuid,
                                                            item_name_)
                        sink.logger.write_stats = {}

                    # Process for handling failed files.
                    if sink.logger and sink.exchange_type in DatabaseExchangeTypes.__members__:
                        if item_name_ in sink.endpoint_items["sink_failed"]:
                            logging.info(
                                f'{item_name_} had an existing FAILED status for {sink.endpoint_schema["tag"]}. '
                                f'Checking if any data made it over...')
                            records_loaded = sink.get_records_in_table_by_data_item(item_name_)
                            update_type = LoggerEnums.SnowflakeWritesOperations.UPSERT.value \
                                if sink.endpoint_schema["info"]["opts"]["upsert"] else \
                                LoggerEnums.SnowflakeWritesOperations.INSERT.value
                            if records_loaded == 0:
                                logging.info(f"No data appeared to have made it into the sink. "
                                             f"Re-processing data via {update_type} method...")
                            else:
                                logging.info(
                                    f"{records_loaded:,} record(s) exist for this data item in the sink.")
                                # Check for ignore_force_merge. This will speed up processing by
                                # not creating temp/dedupe tables.
                                if sink.logger.ignore_force_merge:
                                    logging.info(
                                        "Ignoring force merge option. Records will continue to be inserted.")
                                elif update_type == LoggerEnums.SnowflakeWritesOperations.UPSERT.value:
                                    logging.info(
                                        f"To be safe, re-processing data via the original selected "
                                        f"{update_type} method.")
                                # If force_merge is used, data that is normally inserted will first be
                                # put into a temp table, deduped, and then merged into the main table.
                                elif update_type == LoggerEnums.SnowflakeWritesOperations.INSERT.value:
                                    logging.info("To be safe, re-processing data via FORCED MERGE method.")
                                    force_merge = True

                    try:
                        result = None
                        if sink.logger and sink.exchange_type in FileExchangeTypes.__members__:
                            sink.logger.update_base_table({
                                "UUID": item_uuid,
                                "TOTAL_RECORDS_WRITTEN": 0,
                                "TOTAL_RECORDS_TO_WRITE": df_record_count_by_sink,
                                "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.PROCESSING.value,
                                "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SYNCING.value,
                                "EXCHANGE_MESSAGE": f"Total records to write is {df_record_count_by_sink:,}."
                            })

                        # Process for iterative processing runs.
                        if sink.exchange_type in DatabaseExchangeTypes.__members__ \
                                and sink.logger:
                            if self.dataframe_type == SDCDFTypes.PANDAS:
                                if sink.logger.processing_type == "by_records":
                                    chunk_size = sink.logger.chunk_size
                                    total_runs = math.ceil(source_record_count_by_sink / chunk_size)

                                    # If the item previously failed, set the current_record and
                                    # current_run based on the logging stats.
                                    if item_name_ in sink.endpoint_items["sink_failed"]:
                                        current_record = sink.endpoint_items["sink_failed"][item_name_][
                                            "TOTAL_RECORDS_WRITTEN"]
                                        current_run = math.ceil(current_record / chunk_size) + 1
                                        logging.info(
                                            f"Attempting to pick-up processing from PROCESSING RUN "
                                            f"{current_run:,}/{total_runs:,}. "
                                            f"{current_record:,}/{source_record_count_by_sink:,} record(s) already written.")
                                    # If the file has never been processed, we start from record and run #1.
                                    else:
                                        current_record = 1
                                        current_run = 1
                                        logging.info(
                                            f"Attempting to process {source_record_count_by_sink:,} record(s) at "
                                            f"{chunk_size} record(s) each. Total runs needed: {total_runs:,}")

                                    # Iterative processes require a generator that outputs a dataframe.
                                    # Minus 1 from current_record for zero-indexed dataframes.
                                    source_data_generator = source_data.get_file_as_dataframe(
                                        start_row_=current_record - 1, chunksize_=chunk_size)

                                    while current_record < source_record_count_by_sink:
                                        if (current_record + chunk_size) > source_record_count_by_sink:
                                            chunk_size = source_record_count_by_sink - current_record
                                        logging.info(f"Starting PROCESSING RUN {current_run:,}/{total_runs:,}: "
                                                     f"Attempting to write {chunk_size:,} record(s)...")
                                        try:
                                            result = sink.write_data(
                                                source_data_generator,
                                                item_name_,
                                                force_merge_=force_merge,
                                                item_uuid_=item_uuid)

                                            current_record += chunk_size
                                            current_run += 1

                                        except Exception as e:
                                            msg = f"{FileResultTypes.error.value}: Syncing {item_name_} to {sink.exchange_type} failed."
                                            logging.error(f"{msg}. {e}")
                                            self.etl_results_log[sink.endpoint_tag].append(msg)
                                            if sink.logger:
                                                sink.logger.update_base_table({
                                                    "UUID": item_uuid,
                                                    "TOTAL_RECORDS_WRITTEN": 0,
                                                    "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.FAILED.value,
                                                    "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SYNCING.value,
                                                    "EXCHANGE_MESSAGE": msg,
                                                    "WRITE_STATS": sink.logger.write_stats
                                                })
                                            traceback.print_exc(file=sys.stdout)
                                            continue

                                        msg = f"Completed PROCESSING RUN {current_run - 1}/{total_runs}: {chunk_size:,} record(s) written."
                                        logging.info(msg)
                                        if sink.logger:
                                            sink.logger.update_base_table({
                                                "UUID": item_uuid,
                                                "TOTAL_RECORDS_WRITTEN": chunk_size,
                                                "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.PROCESSING.value,
                                                "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SYNCING.value,
                                                "EXCHANGE_MESSAGE": msg,
                                                "WRITE_STATS": sink.logger.write_stats
                                            })

                                    # TODO: For incremental runs, result is set to '' as the total would only
                                    #  reflect the last batch of inserts/upserts, not the entire ETL job. Find a
                                    #  better way to show this in the message/logs.
                                    result = ''
                                else:
                                    result = sink.write_data(
                                        source_data,
                                        item_name_,
                                        force_merge_=force_merge,
                                        item_uuid_=item_uuid)
                            else:
                                raise Exception(
                                    f'Dataframe type "{self.dataframe_type}" is not valid for iterative '
                                    f'processing of dataframes.')

                        elif sink.exchange_type in DatabaseExchangeTypes.__members__:
                            result = sink.write_data(
                                source_data, item_name_, force_merge_=force_merge, item_uuid_=item_uuid)
                        elif sink.exchange_type in NoSqlDatabaseExchangeTypes.__members__:
                            result = sink.write_data(source_data)
                        else:
                            result = sink.write_data(source_data, item_name_)

                        msg = f"{FileResultTypes.success.value}: Loaded {item_name_} to {sink.exchange_type}. {result}"
                        logging.info(msg)
                        self.etl_results_log[sink.endpoint_tag].append(msg)
                        if sink.logger:
                            # Default to 0 in case no write stats are available for this run.
                            record_count = 0
                            if sink.logger.processing_type == "by_records":
                                # When processing iteratively, we drop the result numbers and the record_count
                                # as those would represent the last chunk size and not the total amount.
                                msg = f"{FileResultTypes.success.value}: Loaded {item_name_} to {sink.exchange_type}."
                            elif "STATS" in sink.logger.write_stats:
                                record_count = [
                                    i["TOTAL_RECORDS"]
                                    for i in sink.logger.write_stats["STATS"]
                                    if i["DEST_TYPE"] == "MAIN"
                                ][0]
                            sink.logger.update_base_table({
                                "UUID": item_uuid,
                                "TOTAL_RECORDS_WRITTEN": record_count,
                                "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.FINISHED.value,
                                "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.NONE.value,
                                "EXCHANGE_MESSAGE": msg,
                                "WRITE_STATS": sink.logger.write_stats
                            })

                    except DataframeForDBEmpty:
                        log_empty_result(item_name_)
                        continue

                    except Exception as e:
                        msg = f"{FileResultTypes.error.value}: Syncing {item_name_} to {sink.exchange_type} failed."
                        logging.error(f"{msg}. {e}")
                        self.etl_results_log[sink.endpoint_tag].append(msg)
                        if sink.logger:
                            sink.logger.update_base_table({
                                "UUID": item_uuid,
                                "TOTAL_RECORDS_WRITTEN": 0,
                                "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.FAILED.value,
                                "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SYNCING.value,
                                "EXCHANGE_MESSAGE": msg,
                                "WRITE_STATS": sink.logger.write_stats
                            })
                        traceback.print_exc(file=sys.stdout)
                        continue

        if self.source_endpoint_schema["info"]["type"] in MockExchangeTypes.__members__:
            sdc_df = self.source.get_data()

            for sink in self.sinks:
                try:
                    self.etl_results_log[sink.endpoint_tag] = []
                    result = sink.write_data(data_=sdc_df, file_name_="MOCKED_DATASET")
                    msg = f"{ExchangeResultTypes.SUCCESS.value}: Loaded mock data " \
                          f"to {sink.exchange_type}. {result}"
                    logging.info(msg)
                    self.etl_results_log[sink.endpoint_tag].append(msg)
                except Exception as e:
                    msg = f"{ExchangeResultTypes.ERROR.value}: Syncing mock data to " \
                          f"{sink.exchange_type} failed."
                    logging.error(f"{msg}. {e}")
                    self.etl_results_log[sink.endpoint_tag].append(msg)

        elif self.source_endpoint_schema["info"]["type"] in FileExchangeTypes.__members__:

            # Log message if no items exist to process for a sink.
            for sink in self.sinks:
                self.etl_results_log[sink.endpoint_tag] = []
                if len(self.sink_items_to_process[sink.endpoint_tag]) == 0:
                    msg = f"There were no new items to send to the {sink.exchange_type} sink."
                    logging.info(msg)
                    self.etl_results_log[sink.endpoint_tag].extend([msg])

            num_items_to_process = len(self.sink_items_to_process[self.source.endpoint_tag])
            logging.info(f"There are {num_items_to_process} item(s) to process.")

            if self.process_by_block:
                exchange_data_by_file_in_blocks(num_items_to_process)
            else:
                exchange_data_by_file(num_items_to_process)

            return self.etl_results_log

        elif self.source_endpoint_schema["info"]["type"] in DatabaseExchangeTypes.__members__ \
                or self.source_endpoint_schema["info"]["type"] in NoSqlDatabaseExchangeTypes.__members__:

            sdc_df = self.source.get_data()

            for sink in self.sinks:
                self.etl_results_log[sink.endpoint_tag] = []
                try:
                    if sink.exchange_type in FileExchangeTypes.__members__:
                        item_name = sink.file_name
                        result = sink.write_data(sdc_df, item_name)
                    else:
                        item_name = "data"
                        result = sink.write_data(sdc_df)

                    msg = f"{FileResultTypes.success.value}: Loaded " \
                          f"{item_name} to {sink.exchange_type}. {result}"
                    logging.info(msg)
                    self.etl_results_log[sink.endpoint_tag].append(msg)

                except DataframeForDBEmpty:
                    log_empty_result(None)
                    continue

                except Exception as e:
                    traceback.print_exc(file=sys.stdout)
                    msg = f"{FileResultTypes.error.value}: " \
                          f"Syncing {item_name} to " \
                          f"{sink.exchange_type} failed."
                    logging.error(f"{msg}. {e}")
                    self.etl_results_log[sink.endpoint_tag].append(msg)
                    continue
            return self.etl_results_log

        elif self.source_endpoint_schema["info"]["type"] in APIExchangeTypes.__members__:

            sourced = False
            try:
                sdc_df = self.source.get_data()
                sourced = True
            except Exception as e:
                sourced = False
                msg_error = f"{APIResultTypes.error.value}: Sourcing from {self.source.exchange_type} failed."
                logging.warning(f"{msg_error}. {e}")
                for sink in self.sinks:
                    self.etl_results_log[sink.endpoint_tag] = [msg_error]
                    if sink.logger:
                        logging.warning("Logger is currently not set up for API endpoints. Results not logged.")
                raise Exception(f"{msg_error}. {e}")

            if sourced:
                if sdc_df is None:
                    # TODO: Implement logic in https://github.com/CamelotVG/data-engineering/issues/277 to check
                    #  whether no data should be considered OK (just log) or an error should be returned.
                    #  Defaulting to log right now.
                    msg_nodata = f"{APIResultTypes.nodata.value}: No data was returned from the API call."
                    logging.info(msg_nodata)
                    for sink in self.sinks:
                        self.etl_results_log[sink.endpoint_tag] = [msg_nodata]
                        if sink.logger:
                            logging.warning("Logger is currently not set up for API endpoints. Results not logged.")

                else:
                    for sink in self.sinks:
                        self.etl_results_log[sink.endpoint_tag] = []
                        api_name = self.source_endpoint_schema["info"]["access"]["api_name"]

                        try:
                            if sink.exchange_type in FileExchangeTypes.__members__:
                                item_name = sink.file_name
                                result = sink.write_data(sdc_df, item_name)
                            else:
                                item_name = "data"
                                result = sink.write_data(sdc_df)

                            msg = f"{APIResultTypes.success.value}: Loaded data from API " \
                                  f"{api_name} to {sink.exchange_type}. {result}"
                            logging.info(msg)
                            self.etl_results_log[sink.endpoint_tag].append(msg)
                        except Exception as e:
                            msg = f"{APIResultTypes.error.value}: Syncing {api_name} to " \
                                  f"{sink.exchange_type} failed."
                            logging.error(f"{msg}. {e}")
                            self.etl_results_log[sink.endpoint_tag].append(msg)
                            continue
            return self.etl_results_log

        # Try to close source and sink connections, if applicable to the type.
        self._close_connections()
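
    # Note: exchange_data() returns self.etl_results_log, a dict keyed by sink endpoint tag
    # where each value is the list of result messages (success/error/empty/no-new-items)
    # accumulated for that sink during the run, e.g. {"<sink_tag>": ["<result message>", ...]}
    # (the tag and message shown here are placeholders).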