import datetime
import hashlib
import logging

from sdc_etl_libs.sdc_credentials.sdc_endpoint_credentials import SDCEndpointCredentials
from sdc_etl_libs.database_helpers.SnowflakeDatabase import SnowflakeDatabase
from sdc_etl_libs.sdc_data_exchange_loggers.SDCSnowflakeLogger import SDCSnowflakeLoggerEnums as LoggerEnums

logging.basicConfig(format='%(levelname)s: %(asctime)s: %(funcName)s: %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class SDCSnowflakeLogger:

    def __init__(self, logging_info_: dict):
        """
        Establishes table requirements and makes the connection to Snowflake for the logger.

        :param logging_info_: Dict. Logging info parameters from a data schema sink.
            This will generally come from a data schema endpoint section. For more information on
            how this section is set up, see the documentation at:
            https://github.com/CamelotVG/data-engineering/blob/master/libs/sdc_etl_libs/sdc_data_schema/README.md#endpoint_info_key_logger

            Example logging_info_:
                {
                    "type": "snowflake",
                    "opts": {
                        "environment": "production"
                    },
                    "access": {
                        "credentials": {
                            "type": "vault",
                            "opts": {
                                "secrets_engine": "prod-secrets",
                                "domain": "data",
                                "secret_path": "snowflake/service_account/logger"
                            }
                        }
                    }
                }
        :return: None.
        """

        # Set database/schema/table names/paths
        if isinstance(logging_info_, dict):
            self.logging_info = logging_info_
        else:
            raise Exception("The logging info given does not appear to be in a dict format.")
        self.endpoint_uuids_logged = []

        if self.logging_info["opts"]["environment"] == 'development':
            self.database = LoggerEnums.DefaultDevLoggingTableLocation.DATABASE_NAME.value
            self.schema = LoggerEnums.DefaultDevLoggingTableLocation.SCHEMA_NAME.value
        elif self.logging_info["opts"]["environment"] == 'production':
            self.database = LoggerEnums.DefaultProdLoggingTableLocation.DATABASE_NAME.value
            self.schema = LoggerEnums.DefaultProdLoggingTableLocation.SCHEMA_NAME.value

        # Optional overrides from the logging info "opts" section.
        if self.logging_info.get("opts", {}).get("database"):
            self.database = self.logging_info["opts"]["database"]
        if self.logging_info.get("opts", {}).get("schema"):
            self.schema = self.logging_info["opts"]["schema"]

        self.schema_path = f'"{self.database}"."{self.schema}"'
        self.base_table = LoggerEnums.LoggingTables.BASE_TABLE.value["TABLE_NAME"]
        self.events_table = LoggerEnums.LoggingTables.EVENTS_TABLE.value["TABLE_NAME"]
        self.base_table_path = f'{self.schema_path}."{self.base_table}"'
        self.events_table_path = f'{self.schema_path}."{self.events_table}"'

        # Set Logger attributes
        self.make_incremental_cols = []
        if self.logging_info["opts"].get("processing") is not None:
            self.processing_type = self.logging_info["opts"]["processing"]["type"]
            self.chunk_size = self.logging_info["opts"]["processing"]["opts"].get("chunk_size")
            self.ignore_force_merge = self.logging_info["opts"]["processing"]["opts"]["ignore_force_merge"]
        else:
            self.processing_type = None

        # Generate column requirements/mapping for insert statements
        self.required_on_insert = self.__generate_requirements_dict("REQUIRED_ON_INSERT")
        self.required_for_uuid = self.__generate_requirements_dict("REQUIRED_FOR_UUID")
        self.__generate_endpoint_detail_table_mapping()

        # Get Snowflake credentials and connect.
        self.__grab_credentials()
        self.__connect()

    def __grab_credentials(self) -> None:
        """
        Grabs the credentials from the desired secrets vault via what's specified in the logging info.
        """

        self.credentials = SDCEndpointCredentials.get_credentials(self.logging_info["access"]["credentials"])

    def __connect(self) -> None:
        """
        Connects to Snowflake and assigns the Snowflake handle to self.client.
        :return: None.
        """

        self.client = SnowflakeDatabase()
        try:
            self.client.connect(
                warehouse_=self.credentials["warehouse"],
                database_=self.database,
                schema_=self.schema,
                role_=self.credentials["role"],
                user_=self.credentials["username"],
                password_=self.credentials["password"],
                account_=self.credentials["account"]
            )
        except Exception as e:
            logging.error(f"There was an issue connecting to the SDCSnowflakeLogger tables: {e}")
            raise

        self.client.execute_query("ALTER SESSION SET TIMEZONE = 'UTC';")

    def generate_uuid(self, data_):
        """
        Generates a unique identifier from a set of data.

        :param data_: Dict. Data to generate a UUID from.
        :return: String. UUID.
        """

        data = {}
        for key, value in sorted(data_.items(), key=lambda x: x[0].upper()):
            if value:
                data[key.upper()] = value

        data = "|".join([str(v).strip().upper() for k, v in data.items()])
        uuid = hashlib.md5(str(data).upper().encode()).hexdigest()

        return uuid
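    # A minimal usage sketch of generate_uuid (hypothetical values): keys are sorted and
    # upper-cased, falsy values are dropped, and the remaining values are stripped, upper-cased
    # and joined with "|" before hashing, so the same logical record always yields the same UUID.
    #   logger.generate_uuid({"BUCKET": " my-bucket ", "PREFIX": "data/"})
    #   # -> MD5 hex digest of "MY-BUCKET|DATA/"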

    def __generate_endpoint_detail_table_mapping(self):
        """
        Generates the following lookup dictionaries which map data sets to the appropriate endpoint (Ex. S3) tables:
            - self.endpoint_detail_table_mapping: Mapping to an endpoint details table (Ex. LOGGING_S3_ENDPOINTS)
            - self.endpoint_write_table_mapping: Mapping to an endpoint writes table (Ex. LOGGING_S3_WRITES)
        :return: None.
        """

        self.endpoint_detail_table_mapping = {}
        for item in LoggerEnums.LoggingTables["ENDPOINT_DETAIL_TABLES"].value:
            self.endpoint_detail_table_mapping[item["TYPE"]] = item
        self.endpoint_write_table_mapping = {}
        for item in LoggerEnums.LoggingTables["ENDPOINT_WRITE_TABLES"].value:
            self.endpoint_write_table_mapping[item["TYPE"]] = item

    def __generate_requirements_dict(self, requirement_):
        """
        Generates table requirement dictionaries that assist with determining such things as:
            - What columns are required to insert into a specific table.
            - What columns are used to generate a UUID for a table.
        :param requirement_: The type of requirement to build a requirements dictionary for. These can include
            any boolean-type parameter in a table data schema from SDCSnowflakeLoggerEnums.
            Examples:
                - "REQUIRED_ON_INSERT": Columns required to be passed in to make an insert to a table.
                - "REQUIRED_FOR_UUID": Columns that are used to generate a record's UUID column.

        Example of generated self.required_on_insert:

            {
                'LOGGING_EXCHANGES':
                    ['SCHEMA_NAME', 'DATA_SOURCE_TAG', 'DATA_SINK_TAG', 'DATA_ITEM_NAME', 'SINK_ENDPOINT_TYPE'],
                'LOGGING_S3_ENDPOINT':
                    ['BUCKET', 'PREFIX'],
                'LOGGING_SNOWFLAKE_ENDPOINT':
                    ['ACCOUNT', 'DATABASE', 'SCHEMA', 'TABLE_NAME']
            }
        """

        requirements_dict = {}

        for key, value in LoggerEnums.LoggingTables.__members__.items():
            if isinstance(value.value, list):
                for item in value.value:
                    requirements_dict[item["TABLE_NAME"]] = \
                        [key for key, val in item["DATA_SCHEMA"].__members__.items()
                         if requirement_ in val.value and val.value[requirement_]]
            else:
                requirements_dict[value.value["TABLE_NAME"]] = \
                    [key for key, val in value.value["DATA_SCHEMA"].__members__.items()
                     if requirement_ in val.value and val.value[requirement_]]

        return requirements_dict

    def __clean_record(self, record_):
        """
        Cleans up a record by performing the following functions:
            - Stripping whitespace from either side of the value.

        :param record_: Dict. Record to be cleaned-up. Only keys with String values will be cleaned.
        :return: Dict. Cleaned record.
        """

        for item, value in record_.items():
            if isinstance(value, str):
                record_[item] = value.strip()

        return record_

    def __generate_merge_query(self, stats_, table_, data_schema_, join_on_=[]):
        """
        Generates a merge query for a logger table. Records passed in via stats_ will have their keys/columns
        verified against the table's data schema. Columns not part of the schema will be ignored.

        :param stats_: Dict. Record to be merged into table.
        :param table_: String. Table name to merge into.
        :param data_schema_: Enum. Data schema for the table being merged into.
        :param join_on_: List. Column names to join on (between the record and the table).
        :return: String. Merge query that can then be run against Snowflake.
        """

        select_column_clause = ""
        from_values_clause = ""
        matched_clause = ""
        not_matched_clause = ""
        values_clause = ""
        join_on_clause = " AND ".join([f"t1.{x} = t2.{x}" for x in join_on_])
        col_count = 1

        for col_name, col_value in stats_.items():
            if col_name in data_schema_.__members__:
                select_column_clause += f"column{col_count} as {col_name}, "
                col_count += 1
                matched_clause += f't1.{col_name} = t2.{col_name}, '
                not_matched_clause += f'{col_name}, '
                values_clause += f'{col_name}, '
                if data_schema_[col_name].value["TYPE"] in self.client.non_quotable_datatypes:
                    from_values_clause += f"{col_value}, "
                else:
                    from_values_clause += f"'{col_value}', "

        merge_query = f"""
            MERGE INTO {self.schema_path}."{table_}" t1 USING (
                SELECT {select_column_clause[:-2]}
                FROM VALUES ({from_values_clause[:-2]})) t2
            ON {join_on_clause}

            WHEN MATCHED THEN UPDATE SET {matched_clause[:-2]}

            WHEN NOT MATCHED THEN INSERT ({not_matched_clause[:-2]})
            VALUES ({values_clause[:-2]});
            """

        return merge_query
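    # Shape of the generated statement, sketched for a hypothetical record
    # {"UUID": "abc123", "BUCKET": "my-bucket"} joined on UUID (values are illustrative only):
    #   MERGE INTO "<database>"."<schema>"."<table>" t1 USING (
    #       SELECT column1 as UUID, column2 as BUCKET
    #       FROM VALUES ('abc123', 'my-bucket')) t2
    #   ON t1.UUID = t2.UUID
    #   WHEN MATCHED THEN UPDATE SET t1.UUID = t2.UUID, t1.BUCKET = t2.BUCKET
    #   WHEN NOT MATCHED THEN INSERT (UUID, BUCKET) VALUES (UUID, BUCKET);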

    def setup_logging_environment(self):
        """
        Creates all the logging tables needed in the logger environment. Tables are generated from the information
        contained in SDCSnowflakeLoggerEnums.LoggingTables.
        :return: None.
        """

        # TODO: This function would be considered an "admin" logger action. We should eventually move this to a
        #  separate class that can also control things like altering tables (adding new columns), manually updating
        #  log records, etc.

        tables = []
        for key, value in LoggerEnums.LoggingTables.__members__.items():
            if isinstance(value.value, list):
                tables.extend(value.value)
            else:
                tables.append(value.value)

        create_statement = 'CREATE TABLE'

        for table in tables:
            create_query = f'{create_statement} {self.schema_path}."{table["TABLE_NAME"]}" ('

            for column, datatype in table["DATA_SCHEMA"].__members__.items():
                if "CONSTRAINTS" in datatype.value and datatype.value["CONSTRAINTS"]:
                    constraint_statement = datatype.value["CONSTRAINTS"]
                else:
                    constraint_statement = ""
                create_query += f'"{column}" {datatype.value["TYPE"]} {constraint_statement},'
            sql_query = create_query[:-1] + ");"

            self.client.execute_query(sql_query)
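    # Sketch of a statement this produces (table, columns, datatypes and constraints all come from
    # the enums, so the names below are hypothetical and for illustration only):
    #   CREATE TABLE "<database>"."<schema>"."LOGGING_EXCHANGES" ("UUID" VARCHAR NOT NULL,
    #   "DATA_ITEM_NAME" VARCHAR , "TOTAL_RECORDS_WRITTEN" NUMBER );
    # Columns without a constraint get a trailing space before the comma; Snowflake tolerates the
    # extra whitespace, so the generated DDL still runs.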

    def generate_endpoint_uuid(self, endpoint_schema_, log_=False):
        """
        Generates an endpoint detail UUID from an SDCDataSchema endpoint.
        :param endpoint_schema_: Dict. SDCDataSchema endpoint.
        :param log_: Boolean. If True, the generated endpoint UUID (and details) will be merged into its associated
            endpoint table. Default = False.
        :return: String. Generated endpoint detail UUID.
        """

        record = self.__clean_record(endpoint_schema_["info"])

        endpoint_type = record["type"].upper()
        endpoint_table = self.endpoint_detail_table_mapping[endpoint_type]["TABLE_NAME"]

        uuid_ident = {}
        for item in self.required_for_uuid[endpoint_table]:
            uuid_ident[item] = endpoint_schema_["info"]["access"][item.lower()].strip()
        uuid_ident["UUID"] = self.generate_uuid(uuid_ident)

        if log_:
            self.update_endpoint_table(endpoint_schema_, uuid_ident)

        return uuid_ident["UUID"]

    def update_endpoint_table(self, endpoint_schema_, stats_):
        """
        Updates an endpoint table with a new record.

        :param endpoint_schema_: Dict. SDCDataSchema endpoint. This is used to determine the appropriate
            endpoint detail table to write to.
        :param stats_: Dict. Record to be merged into the associated endpoint table (e.g. the endpoint detail
            columns plus the "UUID" generated by generate_endpoint_uuid).
        :return: None.
        """

        # Determine endpoint table name and data schema from endpoint schema
        table = self.endpoint_detail_table_mapping[endpoint_schema_["info"]["type"].upper()]["TABLE_NAME"]
        data_schema = self.endpoint_detail_table_mapping[endpoint_schema_["info"]["type"].upper()]["DATA_SCHEMA"]

        # Generate merge query and update endpoint table
        merge_query = self.__generate_merge_query(stats_, table, data_schema, join_on_=["UUID"])
        self.client.execute_query(merge_query)

        # Log the endpoint UUID so the Logger knows it does not need to be logged again.
        self.endpoint_uuids_logged.append(stats_["UUID"])

    def insert_into_writes_table(self, write_stats_):
        """
        Inserts a record into an endpoint "writes" table. The endpoint type is determined from the "TYPE" key in the
        passed-in write_stats_ dict. write_stats_ is generated within an Endpoint class (ex. SDCS3Endpoint) when
        a logger is given to it.

        :param write_stats_: Dict. Records to insert into the "writes" table.

            Example of a write_stats_:
                {
                    'TYPE': 'SNOWFLAKE',
                    'EVENT_UUID': '68cf82f7eafc95a56106bc9b61ec8a2b',
                    'STATS': [
                        {'TABLE_NAME': 'ORDERS_TEMP', 'TABLE_TYPE': 'TEMP', 'INSERTED': 319, 'UPSERTED': 0},
                        {'TABLE_NAME': 'ORDERS_TEMP_DEDUPED', 'TABLE_TYPE': 'DEDUPED', 'INSERTED': 319, 'UPSERTED': 0},
                        {'TABLE_NAME': 'ORDERS', 'TABLE_TYPE': 'MAIN', 'INSERTED': 0, 'UPSERTED': 319}
                    ]
                }

        :return: None.
        """

        table = self.endpoint_write_table_mapping[write_stats_["TYPE"].upper()]["TABLE_NAME"]
        data_schema = self.endpoint_write_table_mapping[write_stats_["TYPE"].upper()]["DATA_SCHEMA"]

        column_clause = ""
        values_clause = ""

        if write_stats_["STATS"]:
            for num, record in enumerate(write_stats_["STATS"]):
                record["EVENT_UUID"] = write_stats_["EVENT_UUID"]

                cols_passed = set(list(record.keys()))
                missing_stats = set(self.required_on_insert[table]) - cols_passed
                if missing_stats:
                    raise Exception(f"Missing {missing_stats} values for proper logging to the writes table.")

                if num == 0:
                    for col_name in record.keys():
                        if col_name not in data_schema.__members__:
                            logging.info(f"Stat '{col_name}' not a valid option for logging to the writes table.")
                        else:
                            column_clause += f"{col_name}, "

                values = "("
                for col_name, col_value in record.items():
                    # Skip any stat keys that are not columns in the table schema.
                    if col_name not in data_schema.__members__:
                        continue
                    if data_schema[col_name].value["TYPE"] in self.client.non_quotable_datatypes:
                        values += f"{col_value}, "
                    else:
                        values += f"'{col_value}', "
                values_clause += values[:-2] + "), "

            writes_query = f'INSERT INTO {self.schema_path}."{table}" ({column_clause[:-2]}) VALUES {values_clause[:-2]};'

            # Insert into writes table.
            self.client.execute_query(writes_query)
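    # Sketch of the multi-row insert this builds for the example write_stats_ above (the target
    # table name and column datatypes depend on the enums, so treat this as illustrative only):
    #   INSERT INTO "<database>"."<schema>"."<snowflake writes table>"
    #   (TABLE_NAME, TABLE_TYPE, INSERTED, UPSERTED, EVENT_UUID)
    #   VALUES ('ORDERS_TEMP', 'TEMP', 319, 0, '68cf82f7eafc95a56106bc9b61ec8a2b'),
    #          ('ORDERS_TEMP_DEDUPED', 'DEDUPED', 319, 0, '68cf82f7eafc95a56106bc9b61ec8a2b'),
    #          ('ORDERS', 'MAIN', 0, 319, '68cf82f7eafc95a56106bc9b61ec8a2b');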

    def insert_into_events_table(self, record_):
        """
        Inserts data into the logging events table.
        :param record_: Dict. Values to write to the events table. The values required for the dict are in
            SDCSnowflakeLoggerEnums.BaseTableToEventsTableMapping.

            Example of record_:
                {
                    "UUID": "d3d2af251a51a805b0d8c51d73a9b105",
                    "TOTAL_RECORDS_WRITTEN": 15000,
                    "CURRENT_EXCHANGE_STATUS": ExchangeStatuses.PROCESSING.value,
                    "CURRENT_EXCHANGE_REASON": None,
                    "EXCHANGE_MESSAGE": "Processing..."
                }

        In addition:
            - Names will be converted from the base table column names to the events table column names.
            - An event date will be generated from the current UTC time.
            - An event UUID will be generated from what's REQUIRED_FOR_UUID from SDCSnowflakeLoggerEnums.EventsSchema.

        :return: String. Event UUID.
        """

        record = {}

        # Take an Insert/Update record (for the base table) and convert the columns to the appropriate Events table names.
        for key, value in LoggerEnums.BaseTableToEventsTableMapping.__members__.items():
            if value.value in record_:
                record[key] = record_[value.value]

        # Assign Event Date as UTC now.
        record[LoggerEnums.EventsSchema.EVENT_DATE.value["NAME"]] = datetime.datetime.utcnow()
        uuid_ident = {}
        for item in self.required_for_uuid[self.events_table]:
            uuid_ident[item] = record[item]
        record[LoggerEnums.EventsSchema.UUID.value["NAME"]] = self.generate_uuid(uuid_ident)

        # Look for any missing columns that are required for inserting into the events table.
        cols_passed = set(list(record.keys()))
        missing_stats = set(self.required_on_insert[self.events_table]) - cols_passed
        if missing_stats:
            raise Exception(f"Missing {missing_stats} values for proper logging to the events table.")

        # Construct insert query.
        column_clause = ""
        values_clause = ""
        for col_name, col_value in record.items():
            if col_name not in LoggerEnums.EventsSchema.__members__:
                logging.info(f"Stat '{col_name}' not a valid option for logging to the events table.")
            else:
                column_clause += f"{col_name}, "
                if LoggerEnums.EventsSchema[col_name].value["TYPE"] in self.client.non_quotable_datatypes:
                    values_clause += f"{col_value}, "
                else:
                    values_clause += f"'{col_value}', "

        events_query = f"INSERT INTO {self.events_table_path} ({column_clause[:-2]}) VALUES ({values_clause[:-2]});"

        # Insert into events table.
        self.client.execute_query(events_query)

        return record[LoggerEnums.EventsSchema.UUID.value["NAME"]]

    def add_item_to_base_table(self, record_):
        """
        Adds a new data item record to the base logging table and logs to the events table.
        :param record_: Dict. Values to log.

            Example of record_:
                {
                    "SCHEMA_NAMESPACE": "SDC-Vendor",
                    "SCHEMA_NAME": "tracking-events",
                    "DATA_SOURCE_TAG": "main_source",
                    "DATA_SOURCE_TYPE": "S3",
                    "DATA_SOURCE_ENDPOINT_UUID": "07a915c8c3637da4b86b3721744397ba",
                    "DATA_SINK_TAG": "SDC_sink_0",
                    "DATA_SINK_TYPE": "Snowflake",
                    "DATA_SINK_ENDPOINT_UUID": "f912969dc8c1aa577465d59e8e4a60a2",
                    "DATA_ITEM_NAME": "tracking-events-20200111.csv",
                    "EXCHANGE_MESSAGE": "Logging started"
                }

        In addition:
            - Default values for the table will be set (such as the starting exchange status and 0 starting records).
            - A date-updated timestamp will be generated from the current UTC time.
            - A UUID will be generated from what's REQUIRED_FOR_UUID from SDCSnowflakeLoggerEnums.BaseTableSchema.

        :return: String. UUID of the record generated (based on the values passed in + current timestamp).
        """

        # Determine if all necessary columns were passed in.
        stats_passed = set(list(record_.keys()))
        missing_stats = set(self.required_on_insert[self.base_table]) - stats_passed
        if missing_stats:
            raise Exception(f"Missing {missing_stats} stats for proper logging insert.")

        # Clean-up record.
        record = self.__clean_record(record_)

        # Set default values for loading metrics.
        record[LoggerEnums.BaseTableSchema.CURRENT_EXCHANGE_STATUS
               .value["NAME"]] = LoggerEnums.ExchangeStatuses.STARTED.value
        record[LoggerEnums.BaseTableSchema.CURRENT_EXCHANGE_REASON
               .value["NAME"]] = LoggerEnums.ExchangeStatusReasons.NONE.value
        record[LoggerEnums.BaseTableSchema.DATE_UPDATED.value["NAME"]] = datetime.datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S")
        record[LoggerEnums.BaseTableSchema.EXCHANGE_MESSAGE.value["NAME"]] = "Exchange started."

        # Create unique identifier based on required information.
        uuid_ident = {}
        for item in self.required_for_uuid[self.base_table]:
            uuid_ident[item] = record[item]
        record[LoggerEnums.BaseTableSchema.UUID.value["NAME"]] = self.generate_uuid(uuid_ident)

        # # Add Endpoint UUIDs to endpoint tables, if needed.
        # if record[LoggerEnums.BaseTableSchema.DATA_SOURCE_ENDPOINT_UUID.value["NAME"]] not in self.endpoint_uuids_logged:
        #     self.update_endpoint_table(record, "source")
        # if record[LoggerEnums.BaseTableSchema.DATA_SINK_ENDPOINT_UUID.value["NAME"]] not in self.endpoint_uuids_logged:
        #     self.update_endpoint_table(record, "sink")

        # Generate merge query and merge into base table
        merge_query = self.__generate_merge_query(
            record, self.base_table, LoggerEnums.BaseTableSchema, join_on_=["UUID"])
        self.client.execute_query(merge_query)

        # Insert into events table.
        self.insert_into_events_table(record)

        return record[LoggerEnums.BaseTableSchema.UUID.value["NAME"]]

    def update_base_table(self, update_record_):
        """
        Updates an existing record in the base table. Also:
            - Adds an event record to the events table for the change.
            - Adds one or more write records to the associated endpoint table.

        :param update_record_: Dict. Values to update in the log.

            Example of update_record_:
                {
                    "UUID": "88afc0ef995984df47025eb0f9e99cbf",
                    "TOTAL_RECORDS_WRITTEN": 15000,
                    "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.PROCESSING.value,
                    "CURRENT_EXCHANGE_REASON": None,
                    "EXCHANGE_MESSAGE": "Processed another 15,000 records."
                }

        :return: None.
        """

        # Check for write-out stats. Pop them off the record so they are not treated as base table columns.
        write_stats = None
        if update_record_.get("WRITE_STATS"):
            if update_record_["WRITE_STATS"].get("STATS"):
                write_stats = update_record_.pop("WRITE_STATS")

        # Set updated date to UTC now.
        update_record_[LoggerEnums.BaseTableSchema.DATE_UPDATED.value["NAME"]] = datetime.datetime.utcnow()

        # Create update query.
        set_clause = ""
        for col_name, col_value in update_record_.items():
            if col_name not in LoggerEnums.BaseTableSchema.__members__:
                logging.info(f"Column '{col_name}' not a valid option for logging.")
            else:
                set_clause += col_name + " = "
                if LoggerEnums.BaseTableSchema[col_name].value["TYPE"] in self.client.non_quotable_datatypes:
                    if ("INCREMENTAL_ON_UPDATE" in LoggerEnums.BaseTableSchema[col_name].value
                            and LoggerEnums.BaseTableSchema[col_name].value["INCREMENTAL_ON_UPDATE"]) \
                            or (col_name in self.make_incremental_cols):
                        set_clause += f"coalesce({col_name}, 0) + {col_value}, "
                    else:
                        set_clause += f"{col_value}, "
                else:
                    set_clause += f"'{col_value}', "

        update_query = f"""UPDATE {self.base_table_path}
            SET {set_clause[:-2]}
            WHERE UUID = '{update_record_["UUID"]}';"""

        # Update the base table.
        self.client.execute_query(update_query)

        # Insert the update into the events table.
        event_uuid = self.insert_into_events_table(update_record_)

        # Update write stats.
        if write_stats:
            write_stats["EVENT_UUID"] = event_uuid
            self.insert_into_writes_table(write_stats)
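    # Sketch of the UPDATE this builds for the example above, assuming TOTAL_RECORDS_WRITTEN is
    # flagged INCREMENTAL_ON_UPDATE in the enums (column names and values are illustrative only):
    #   UPDATE "<database>"."<schema>"."<base table>"
    #   SET UUID = '88afc0ef995984df47025eb0f9e99cbf',
    #       TOTAL_RECORDS_WRITTEN = coalesce(TOTAL_RECORDS_WRITTEN, 0) + 15000,
    #       CURRENT_EXCHANGE_STATUS = 'PROCESSING', ...
    #   WHERE UUID = '88afc0ef995984df47025eb0f9e99cbf';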

    def retrieve_successful_runs(self, data_sink_endpoint_uuid_, empty_is_success_=False):
        """
        Retrieves a list of data items that were successfully loaded to an endpoint sink.

        :param data_sink_endpoint_uuid_: String. UUID of endpoint to check for items against.
        :param empty_is_success_: Boolean. If True, data items that were previously marked as EMPTY
            (e.g. an S3 file that contained no data) will be considered a SUCCESS and included in this list, resulting
            in the item not being re-processed.
        :return: Names of data items that successfully synced.
        :rtype: list
        """

        statuses = [LoggerEnums.ExchangeStatuses.FINISHED.value]
        if empty_is_success_:
            statuses.append(LoggerEnums.ExchangeStatuses.EMPTY.value)
        where_statuses = "','".join(statuses)

        select_query = f"""SELECT
            {LoggerEnums.BaseTableSchema.DATA_ITEM_NAME.value["NAME"]}
            FROM {self.base_table_path}
            WHERE {LoggerEnums.BaseTableSchema.DATA_SINK_ENDPOINT_UUID.value["NAME"]} = '{data_sink_endpoint_uuid_}'
            AND {LoggerEnums.BaseTableSchema.CURRENT_EXCHANGE_STATUS.value["NAME"]} in ('{where_statuses}')
            """

        self.client.execute_query(select_query, return_results_=True)

        data_names = []
        for row in self.client.get_results():
            data_names.append(row[0])

        return data_names

    def retrieve_reprocess_runs(self):
        """
        Retrieves all logger records with status REPROCESS.
        :return: Existing records with status REPROCESS.
        :rtype: dict
        """
        select_query = f"""
            SELECT
                {LoggerEnums.BaseTableSchema.DATA_ITEM_NAME.value["NAME"]},
                {LoggerEnums.BaseTableSchema.UUID.value["NAME"]},
                coalesce({LoggerEnums.BaseTableSchema.TOTAL_RECORDS_WRITTEN.value["NAME"]}, 0) as "TOTAL_RECORDS_WRITTEN"
            FROM {self.base_table_path}
            WHERE {LoggerEnums.BaseTableSchema.CURRENT_EXCHANGE_STATUS.value["NAME"]} in ('{LoggerEnums.ExchangeStatuses.REPROCESS.value}')
            """

        self.client.execute_query(select_query, return_results_=True)

        data_runs = dict()
        for row in self.client.get_results():
            data_runs[row[0]] = {
                LoggerEnums.BaseTableSchema.UUID.value["NAME"]: row[1],
                LoggerEnums.BaseTableSchema.TOTAL_RECORDS_WRITTEN.value["NAME"]: row[2]
            }

        return data_runs

    def retrieve_failed_runs(self,
                             data_sink_endpoint_uuid_,
                             handle_abandoned_runs_=True,
                             mins_until_abandoned_=120,
                             include_reprocess_: bool = False):
        """
        Retrieves a Dict that contains information about data items that failed syncing.

        Example returned Dict:
            {
                "SDC_predictions_lpmodel_Sep-03-2019_06_00.csv": {
                    "UUID": "874429a32a91e8ba762fb2e3c9d6c62d",
                    "TOTAL_RECORDS_WRITTEN": 0
                },
                "SDC_predictions_lpmodel_Sep-04-2019_00_00.csv": {
                    "UUID": "334429a32a91e8ba762fb2e3429a32ddz",
                    "TOTAL_RECORDS_WRITTEN": 15000
                }
            }

        :param data_sink_endpoint_uuid_: String. UUID of endpoint to check for items against.
        :param handle_abandoned_runs_: Boolean. If True, will first trigger handle_abandoned_runs(), which will
            determine if a run that is not in a SUCCESS or FAILED status is considered ABANDONED. If ABANDONED, the
            item will be marked as FAILED in the logs and included in the end result of retrieve_failed_runs().
        :param mins_until_abandoned_: Int. For use when handle_abandoned_runs_ is True. Number of minutes of
            inactivity after which an item is considered abandoned.
        :param include_reprocess_: Boolean. Flag which either includes or excludes items marked REPROCESS from the result.
        :return: Dict.
        """

        if handle_abandoned_runs_:
            self.handle_abandoned_runs(data_sink_endpoint_uuid_, mins_until_abandoned_)

        if include_reprocess_:
            statuses = [LoggerEnums.ExchangeStatuses.FAILED.value, LoggerEnums.ExchangeStatuses.REPROCESS.value]
        else:
            statuses = [LoggerEnums.ExchangeStatuses.FAILED.value]
        where_statuses = "','".join(statuses)

        select_query = f"""
            SELECT
                {LoggerEnums.BaseTableSchema.DATA_ITEM_NAME.value["NAME"]},
                {LoggerEnums.BaseTableSchema.UUID.value["NAME"]},
                coalesce({LoggerEnums.BaseTableSchema.TOTAL_RECORDS_WRITTEN.value["NAME"]}, 0) as "TOTAL_RECORDS_WRITTEN"
            FROM {self.base_table_path}
            WHERE {LoggerEnums.BaseTableSchema.DATA_SINK_ENDPOINT_UUID.value["NAME"]} = '{data_sink_endpoint_uuid_}'
            AND {LoggerEnums.BaseTableSchema.CURRENT_EXCHANGE_STATUS.value["NAME"]} in ('{where_statuses}')
            """

        self.client.execute_query(select_query, return_results_=True)

        data_runs = dict()
        for row in self.client.get_results():
            data_runs[row[0]] = {
                LoggerEnums.BaseTableSchema.UUID.value["NAME"]: row[1],
                LoggerEnums.BaseTableSchema.TOTAL_RECORDS_WRITTEN.value["NAME"]: row[2]
            }

        return data_runs

    def retrieve_non_abandoned_runs(self, data_sink_endpoint_uuid_, mins_until_abandoned_):
        """
        Returns a list of data item names that are in a non-final status but are not yet considered abandoned.

        :param data_sink_endpoint_uuid_: String. UUID of endpoint to check for items against.
        :param mins_until_abandoned_: Int. Number of minutes prior to the current date time to consider an item abandoned.
        :return: List. Data item names.
        """

        select_query = f"""
            SELECT
                {LoggerEnums.BaseTableSchema.DATA_ITEM_NAME.value["NAME"]}
            FROM {self.base_table_path}
            WHERE {LoggerEnums.BaseTableSchema.DATA_SINK_ENDPOINT_UUID.value["NAME"]} = '{data_sink_endpoint_uuid_}'
            AND {LoggerEnums.BaseTableSchema.CURRENT_EXCHANGE_STATUS.value["NAME"]} in (
                '{LoggerEnums.ExchangeStatuses.STARTED.value}',
                '{LoggerEnums.ExchangeStatuses.PROCESSING.value}')
            AND DATEDIFF(minute, {LoggerEnums.BaseTableSchema.DATE_UPDATED.value["NAME"]},
                CURRENT_TIMESTAMP::timestamp_ntz) <= {mins_until_abandoned_};
            """

        self.client.execute_query(select_query, return_results_=True)

        data_names = []
        for row in self.client.get_results():
            data_names.append(row[0])

        return data_names

    def handle_abandoned_runs(self, data_sink_endpoint_uuid_, mins_until_abandoned_):
        """
        Determines if a data item in the log that is sitting in a non-final status (e.g. PROCESSING, STARTED)
        was abandoned and updates the log tables accordingly. Abandonment is determined by setting a number of minutes
        prior to the current datetime. If an item's last update date was prior to this, then the item will be marked
        as abandoned and updated to a FAILED status.
        :param data_sink_endpoint_uuid_: String. UUID of endpoint to check for items against.
        :param mins_until_abandoned_: Int. Number of minutes prior to the current date time to consider an item abandoned.
        :return: None.
        """

        select_query = f"""
            SELECT
                {LoggerEnums.BaseTableSchema.UUID.value["NAME"]},
                {LoggerEnums.BaseTableSchema.DATA_ITEM_NAME.value["NAME"]},
                coalesce({LoggerEnums.BaseTableSchema.TOTAL_RECORDS_WRITTEN.value["NAME"]}, 0) as "TOTAL_RECORDS_WRITTEN",
                {LoggerEnums.BaseTableSchema.DATE_UPDATED.value["NAME"]}
            FROM {self.base_table_path}
            WHERE {LoggerEnums.BaseTableSchema.DATA_SINK_ENDPOINT_UUID.value["NAME"]} = '{data_sink_endpoint_uuid_}'
            AND {LoggerEnums.BaseTableSchema.CURRENT_EXCHANGE_STATUS.value["NAME"]} in (
                '{LoggerEnums.ExchangeStatuses.STARTED.value}',
                '{LoggerEnums.ExchangeStatuses.PROCESSING.value}')
            AND DATEDIFF(minute, {LoggerEnums.BaseTableSchema.DATE_UPDATED.value["NAME"]},
                CURRENT_TIMESTAMP::timestamp_ntz) > {mins_until_abandoned_};
            """

        self.client.execute_query(select_query, return_results_=True)

        for row in self.client.get_results():
            data_item_uuid = row[0]
            data_item_name = row[1]
            msg = f"{data_item_name} assumed abandoned due to {mins_until_abandoned_} " \
                  f"minute(s) of inactivity. Setting status to FAILED."
            logging.info(msg)
            self.update_base_table({
                LoggerEnums.BaseTableSchema.UUID.value["NAME"]:
                    data_item_uuid,
                LoggerEnums.BaseTableSchema.TOTAL_RECORDS_WRITTEN.value["NAME"]:
                    0,
                LoggerEnums.BaseTableSchema.CURRENT_EXCHANGE_STATUS.value["NAME"]:
                    LoggerEnums.ExchangeStatuses.FAILED.value,
                LoggerEnums.BaseTableSchema.CURRENT_EXCHANGE_REASON.value["NAME"]:
                    LoggerEnums.ExchangeStatusReasons.ABANDONED.value,
                LoggerEnums.BaseTableSchema.EXCHANGE_MESSAGE.value["NAME"]:
                    msg
            })
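

# A minimal usage sketch (hypothetical and for illustration only -- the vault path, schema values
# and record fields below are taken from the docstring examples above, not from a real pipeline):
if __name__ == "__main__":
    logging_info = {
        "type": "snowflake",
        "opts": {"environment": "production"},
        "access": {
            "credentials": {
                "type": "vault",
                "opts": {
                    "secrets_engine": "prod-secrets",
                    "domain": "data",
                    "secret_path": "snowflake/service_account/logger"
                }
            }
        }
    }

    sdc_logger = SDCSnowflakeLogger(logging_info)

    # Start tracking a data item, then mark it finished once the exchange completes.
    item_uuid = sdc_logger.add_item_to_base_table({
        "SCHEMA_NAMESPACE": "SDC-Vendor",
        "SCHEMA_NAME": "tracking-events",
        "DATA_SOURCE_TAG": "main_source",
        "DATA_SOURCE_TYPE": "S3",
        "DATA_SOURCE_ENDPOINT_UUID": "07a915c8c3637da4b86b3721744397ba",
        "DATA_SINK_TAG": "SDC_sink_0",
        "DATA_SINK_TYPE": "Snowflake",
        "DATA_SINK_ENDPOINT_UUID": "f912969dc8c1aa577465d59e8e4a60a2",
        "DATA_ITEM_NAME": "tracking-events-20200111.csv",
        "EXCHANGE_MESSAGE": "Logging started"
    })

    sdc_logger.update_base_table({
        "UUID": item_uuid,
        "TOTAL_RECORDS_WRITTEN": 15000,
        "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.FINISHED.value,
        "CURRENT_EXCHANGE_REASON": None,
        "EXCHANGE_MESSAGE": "Exchange finished."
    })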