1""" Module for validating an SDC data schema and producing an updated schema """ 

2 

3import re 

4from collections import OrderedDict 

5from enum import Enum 

6 

7from sdc_etl_libs.sdc_data_schema import schema_enums as SchemaEnums 

8from sdc_etl_libs.sdc_data_schema.schema_exceptions import ( 

9 DataSchemaCriticalError, DataSchemaFailedValidation) 

10 

11 

class ResultTypeStatuses(Enum):
    """
    Statuses for validation results.
    """
    ERROR = "ERROR"
    WARNING = "WARNING"
    UPDATE = "UPDATE"


class ResultTypeReasons(Enum):
    """
    Reasons for validation results.
    """
    DUPED_TAG = "DUPED_TAG"
    MISSING_KEY = "MISSING_KEY"
    DEFAULT_SET = "DEFAULT_SET"
    UNKNOWN_KEY = "UNKNOWN_KEY"
    BAD_VALUE = "BAD_VALUE"
    BAD_PARENT_TYPE = "BAD_PARENT_TYPE"
    BAD_DATA_TYPE = "BAD_DATA_TYPE"
    NOT_VALID = "NOT_VALID"
    CONFLICT = "CONFLICT"
    DUPLICATE_VALUE = "DUPLICATE_VALUE"
    EMPTY_LIST = "EMPTY_LIST"


class SchemaValidation:

    def __init__(self):

        # Schema attributes
        self.data_schema = None
        self.validation_type = None
        self.print_results = None
        self.schema_name = None

        # Full data schema validation attributes
        self.results = []
        self.ttl_errors = 0
        self.ttl_warnings = 0
        self.ttl_updates = 0
        self.record_section_len = 0
        self.record_status_len = 10
        self.record_reason_len = 18
        self.field_names = []
        self.fields_with_pii = []

    def record_result(self, result_status_, result_reason_, section_name_, message_):
        """
        Record a result of a data schema validation process.
        :param result_status_: Enum key. A status from ResultTypeStatuses.
        :param result_reason_: Enum key. A reason from ResultTypeReasons.
        :param section_name_: String. Colon-separated trail of the schema section (e.g. 'top_level:endpoint:info').
        :param message_: String. Message for record.
        :return: None.
        """

        # Add record to the running set of results
        self.results.append({
            "status": result_status_.value,
            "reason": result_reason_.value,
            "section": section_name_,
            "note": message_
        })

        # Track the longest section_name_ recorded (used to align the console report)
        self.record_section_len = max(len(section_name_), self.record_section_len)

        # Keep a count of each result type for quick validation metrics at the end
        if result_status_ == ResultTypeStatuses.ERROR:
            self.ttl_errors += 1
        elif result_status_ == ResultTypeStatuses.WARNING:
            self.ttl_warnings += 1
        elif result_status_ == ResultTypeStatuses.UPDATE:
            self.ttl_updates += 1
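
    # The shape of a recorded result, as appended to self.results (the values shown
    # here are a hypothetical example, not output from a real run):
    #
    #   {"status": "ERROR", "reason": "MISSING_KEY",
    #    "section": "top_level:endpoints", "note": 'One or more endpoints ...'}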

    def print_validation_results(self, separate_=True):
        """
        Prints out results of validation to the console.
        :param separate_: Boolean. If True, separates result records by status. If False, keeps in order of logging.
        :return: None.
        """

        results = self.results
        separator = f" {'':-^150}"

        section_len = self.record_section_len + 3
        print(f" {'Schema Validation Report':-^150}")
        print(f' Results for "{self.schema_name}": ')
        if len(results) == 0:
            print(" ** Schema passed validation without any issues. Great job! **")
            print(separator)
        else:
            print(f" Schema validation resulted in {self.ttl_errors:,} error(s), {self.ttl_warnings:,} warning(s) "
                  f"and {self.ttl_updates:,} update(s). ERRORS need to be corrected to pass validation.")
            print(separator)

            print(
                f" {'STATUS': <{self.record_status_len}} {'REASON': <{self.record_reason_len}} {'SCHEMA SECTION': <{section_len}} {'DETAIL'}"
            )
            print(separator)
            if separate_:
                for status in ["ERROR", "WARNING", "UPDATE"]:
                    results = [x for x in self.results if x["status"] == status]
                    if len(results) > 0:
                        for result in results:
                            print(
                                f' {result["status"]:.<{self.record_status_len}} {result["reason"]:.<{self.record_reason_len}} '
                                f'{result["section"]:.<{section_len}} {result["note"]}')
                        print(separator)
            else:
                for result in results:
                    print(
                        f' {result["status"]:.<{self.record_status_len}} {result["reason"]:.<{self.record_reason_len}} '
                        f'{result["section"]:.<{section_len}} {result["note"]}')
                print(separator)

    def verify_tag_uniqueness(self):
        """
        Checks for uniqueness of tag names. If a tag is not unique, an ERROR is logged to the result records.
        """

        tags = []
        if "endpoints" not in self.data_schema:
            self.record_result(ResultTypeStatuses.ERROR, ResultTypeReasons.MISSING_KEY, "top_level",
                               'The "endpoints" section appears to be missing. This is a required section.')
            self.finish_validation()
        else:
            for endpoint in self.data_schema["endpoints"]:
                if "tag" not in endpoint:
                    self.record_result(
                        ResultTypeStatuses.ERROR, ResultTypeReasons.MISSING_KEY, "top_level:endpoints",
                        'One or more endpoints appear to be missing the "tag" key. This is required for validation.'
                    )
                    self.finish_validation()
                else:
                    tags.append(endpoint["tag"])
            for tag in set(tags):
                ttl_uses = tags.count(tag)
                if ttl_uses > 1:
                    self.record_result(
                        ResultTypeStatuses.ERROR, ResultTypeReasons.DUPED_TAG, "top_level:endpoints",
                        f'The tag "{tag}" is used {ttl_uses} time(s). It can only be used once per schema.')

    def verify_requirement(self, data_schema_section_, section_name_, enum_key_, enum_keys_, endpoint_typing_):
        """
        Verifies that a key exists in the data schema if it is required. If a key that is required is missing,
        an ERROR message is logged to self.results.
        :param data_schema_section_: Dict. Section of data schema to be validated.
        :param section_name_: String. Colon-separated trail of the schema section (e.g. 'top_level:endpoint:info').
        :param enum_key_: String. Key to be validated against enum_keys_.
        :param enum_keys_: Enum. Enum of expected keys for data schema section being validated.
        :param endpoint_typing_: String. Pipe-delimited typing for section, composed of endpoint type/destination type
            (e.g. "sink|snowflake", "source|api").
        :return: Boolean.
            False = Key was required and was not found in the data schema.
            True = Key was required and was found in the data schema, or, key was not required.
        """

        required, optional, set_default = None, None, None
        enum_key_value = enum_keys_[enum_key_].value

        # 1. CHECK IF KEY EXISTS IN DATA SCHEMA
        exists = enum_key_ in data_schema_section_

        # 2. CHECK IF KEY IS REQUIRED
        if isinstance(enum_key_value["required"], list):
            # If the "required" section is a list, see if the endpoint typing (e.g. 'sink|s3') is included.
            required = endpoint_typing_ in enum_key_value["required"]
        elif enum_key_value["required"] is True:
            required = True
        elif enum_key_value["required"] is False:
            required = False

        # Raise an exception if "required" and "optional" are both True at this point, as that cannot be possible.
        if required is True and enum_key_value.get("optional") is True:
            raise DataSchemaCriticalError(
                f'"required" and "optional" cannot both be set to True for "{endpoint_typing_}". '
                f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.')

        # 3. CHECK IF DEFAULT VALUE IS SET
        # Default value will be used if the key is "optional" and missing from the data schema.
        if "default_value" in enum_key_value:
            set_default = True
            default_value = enum_key_value["default_value"]
        else:
            set_default = False

        # 4. CHECK IF KEY IS OPTIONAL
        # Check if this key is optional based on "optional" explicitly being defined:
        if "optional" in enum_key_value:
            if isinstance(enum_key_value["optional"], list):
                # If the "optional" section is a list, see if the endpoint typing (e.g. 'sink|s3') is included.
                optional = endpoint_typing_ in enum_key_value["optional"]
            elif enum_key_value["optional"] is True:
                optional = True
            elif enum_key_value["optional"] is False:
                optional = False

            # Raise an exception if "required" and "optional" are both True at this point, as that cannot be possible.
            if required is True and optional is True:
                raise DataSchemaCriticalError(
                    f'"required" and "optional" cannot both be set to True for "{endpoint_typing_}". '
                    f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.')

        # If the "optional" key is not explicitly set, then:
        # "optional" is False if required is already True.
        if required:
            optional = False
        # "optional" is True if it is not set at all. This allows the "default_value" to be set if
        # it is included.
        elif "optional" not in enum_key_value:
            optional = True

        # 5. DETERMINE REQUIREMENT
        # ERROR: Key is required and it is not in the data schema
        if required and not exists:
            if endpoint_typing_.split("|")[0] == 'None' or endpoint_typing_.split("|")[1] == 'None':
                self.record_result(
                    ResultTypeStatuses.ERROR, ResultTypeReasons.MISSING_KEY, section_name_,
                    f'"{enum_key_}" is missing here, and needs to be provided before validating the rest of the '
                    f'endpoint.')
            else:
                self.record_result(ResultTypeStatuses.ERROR, ResultTypeReasons.MISSING_KEY, section_name_,
                                   f'"{enum_key_}" is missing and is mandatory when endpoint is "{endpoint_typing_}".')
            return False
        # PASS: Key is required and it is in the data schema
        elif required and exists:
            # No issue, so return True so additional checks can run (check allowed values, data type, etc.)
            return True
        # ERROR: Key is not required or allowed to be optional, and it exists in the data schema
        elif not required and not optional and exists:
            self.record_result(
                ResultTypeStatuses.ERROR, ResultTypeReasons.NOT_VALID, section_name_,
                f'"{enum_key_}" is not a valid key for this section when endpoint is "{endpoint_typing_}".')
            return False
        # PASS: Key is not required but is optional and it exists. Return True so additional checks can be run
        elif not required and optional and exists:
            return True
        # UPDATE: Key is not required, is optional and does not exist in the data schema. If set_default is True,
        # then set the default value. Return False here so subsequent checks do not run, as they are not needed now.
        elif not required and optional and not exists and set_default:
            data_schema_section_[enum_key_] = default_value
            data_type_result = self.verify_data_type(data_schema_section_[enum_key_], section_name_, enum_key_,
                                                     enum_keys_)
            if data_type_result is False:
                raise DataSchemaCriticalError(
                    f'"{enum_key_}" has a default value set, but it violates the data type. '
                    f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.')
            else:
                self.record_result(
                    ResultTypeStatuses.UPDATE, ResultTypeReasons.DEFAULT_SET, section_name_,
                    f'"{enum_key_}" was missing, but a default value was set. Added key with value '
                    f'"{enum_keys_[enum_key_].value["default_value"]}".')
            return False
        # PASS: Key is not required, is optional, does not exist and no default_value is set, so nothing else to do.
        elif not required and optional and not exists and not set_default:
            return False
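
    # A sketch of the enum value shape that verify_requirement() reads (this key and
    # its settings are hypothetical, not taken from SchemaEnums):
    #
    #   FILE_NAME = {
    #       "required": ["sink|s3"],       # or True/False; a list means required for those endpoint typings
    #       "optional": ["source|api"],    # or True/False; omitted entirely = treated as optional
    #       "default_value": "data.csv",   # applied when the key is optional and missing
    #       "data_type": str,
    #   }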

    def verify_data_type(self, schema_key_value_, section_name_, enum_key_, enum_keys_):
        """
        Verifies that a key in a data schema is of an expected type. If a key does not explicitly have a required
        type, then a string is expected. If a key does not have the right type, an ERROR message is logged
        to self.results.
        :param schema_key_value_: Various. Value of key in data schema.
        :param section_name_: String. Colon-separated trail of the schema section (e.g. 'top_level:endpoint:info').
        :param enum_key_: String. Key to be validated against enum_keys_.
        :param enum_keys_: Enum. Enum of expected keys for data schema being validated.
        :return: Boolean. If the data type check passed, returns True. If it failed, returns False.
        """

        enum_key_value = enum_keys_[enum_key_].value

        if schema_key_value_ is None or section_name_ == "top_level":
            return True
        elif "data_type" in enum_key_value:
            if enum_key_value["data_type"] != list and "list_value_opts" in enum_key_value.keys():
                raise DataSchemaCriticalError(
                    f'"list_value_opts" can only be used when "data_type" is "list". '
                    f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.')
            if enum_key_value["data_type"] == "*" or enum_key_value.get("list_value_opts", {}).get("data_type") == "*":
                return True
            elif type(schema_key_value_) != enum_key_value["data_type"]:
                self.record_result(ResultTypeStatuses.ERROR, ResultTypeReasons.BAD_DATA_TYPE, section_name_,
                                   f'"{enum_key_}" is not of type {enum_key_value["data_type"]}.')
                return False
            elif enum_key_value["data_type"] == list:
                data_subtype = enum_key_value.get("list_value_opts", {}).get("data_type", str)
                for value in schema_key_value_:
                    if type(value) != data_subtype:
                        self.record_result(ResultTypeStatuses.ERROR, ResultTypeReasons.BAD_DATA_TYPE, section_name_,
                                           f'"{value}" within list "{enum_key_}" is not of type {data_subtype}.')
        elif "data_type" not in enum_key_value:
            if not isinstance(schema_key_value_, str):
                self.record_result(ResultTypeStatuses.ERROR, ResultTypeReasons.BAD_DATA_TYPE, section_name_,
                                   f'"{enum_key_}" is not of type {str}.')
                return False
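
    # A sketch of a list-typed enum value with per-item options, as read by
    # verify_data_type() and validate_section() (the key itself is hypothetical):
    #
    #   FILE_FILTERS = {
    #       "data_type": list,
    #       "list_value_opts": {
    #           "data_type": str,          # expected type of each list item (defaults to str)
    #           "allow_duplicates": False,
    #           "allow_empty_list": False,
    #       },
    #   }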

    def check_list_for_duplicates(self, schema_key_value_, section_name_, enum_key_):
        """
        Checks that any data schema key mapped to a list does not contain duplicates in its list. If a duplicate
        is found, an ERROR message is logged.
        :param schema_key_value_: Various. Value of key in data schema.
        :param section_name_: String. Colon-separated trail of the schema section (e.g. 'top_level:endpoint:info').
        :param enum_key_: String. Key to be validated against enum_keys_.
        :return: None.
        """

        # Convert to string first in case a dictionary is passed; this also avoids treating 1 and True as the
        # same value
        schema_values_as_strings = [str(value) for value in schema_key_value_]
        for value in set(schema_values_as_strings):
            if schema_values_as_strings.count(value) > 1:
                self.record_result(
                    ResultTypeStatuses.ERROR, ResultTypeReasons.DUPLICATE_VALUE, section_name_,
                    f'Duplicate value "{(value[0:50] + "...") if len(value) > 50 else value}" found within list '
                    f'"{enum_key_}". Please remove duplicates or set "allow_duplicates" for "{enum_key_}" to "True".')

    def verify_list_not_empty(self, schema_key_value_, section_name_, enum_key_):
        """
        Checks that any data schema key is not mapped to an empty list. If a list is empty, an ERROR message is logged.
        :param schema_key_value_: Various. Value of key in data schema.
        :param section_name_: String. Colon-separated trail of the schema section (e.g. 'top_level:endpoint:info').
        :param enum_key_: String. Key to be validated against enum_keys_.
        :return: None.
        """

        if not schema_key_value_:
            self.record_result(
                ResultTypeStatuses.ERROR, ResultTypeReasons.EMPTY_LIST, section_name_,
                f'List "{enum_key_}" is empty. Please insert values or set "allow_empty_list" for "{enum_key_}" to "True".'
            )

    def verify_allowed_values_params(self, enum_key_, enum_keys_):
        """
        Verifies that both required parameters ("type" & "criteria") are defined when using "allowed_values".
        If a parameter is missing, raises a DataSchemaCriticalError.
        :param enum_key_: String. Key to be validated against enum_keys_.
        :param enum_keys_: Enum. Enum of expected keys for data schema being validated.
        :return: None.
        """

        msg = f'"allowed_values" must be a dictionary with parameters for "type" and "criteria". ' \
              f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.'

        if not isinstance(enum_keys_[enum_key_].value["allowed_values"], dict):
            raise DataSchemaCriticalError(msg)
        elif not enum_keys_[enum_key_].value["allowed_values"].get("type"):
            raise DataSchemaCriticalError(msg)
        elif not enum_keys_[enum_key_].value["allowed_values"].get("criteria"):
            raise DataSchemaCriticalError(msg)
        elif enum_keys_[enum_key_].value["allowed_values"]["type"] not in ["list", "range", "regex"]:
            raise DataSchemaCriticalError(f'"type" of "allowed_values" must be one of: "list", "range" or "regex". '
                                          f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.')
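
    # Sketches of the three accepted "allowed_values" forms, as enforced by the three
    # verify_allowed_value* methods below (the criteria values are hypothetical):
    #
    #   {"type": "list", "criteria": ["csv", "json"]}
    #   {"type": "range", "criteria": {"min": 0, "max": 100}}  # at least one of "min"/"max"
    #   {"type": "regex", "criteria": "^[A-Z_]+$"}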

    def verify_allowed_values_list(self, data_schema_, section_name_, enum_key_, enum_keys_):
        """
        Verifies that a key in the data schema contains an allowed value, according to the specified list at
        "allowed_values"["criteria"].
        If a key has a list of values, then each item in that list is verified against allowed values.
        If a key does not have an allowed value, an ERROR message is logged to self.results.
        :param data_schema_: Dict. Section of data schema to be validated.
        :param section_name_: String. Colon-separated trail of the schema section (e.g. 'top_level:endpoint:info').
        :param enum_key_: String. Key to be validated against enum_keys_.
        :param enum_keys_: Enum. Enum of expected keys for data schema being validated.
        :return: None.
        """

        if not isinstance(enum_keys_[enum_key_].value["allowed_values"]["criteria"], list):
            raise DataSchemaCriticalError(
                f'"criteria" of "allowed_values" must be a list when "type" of "allowed_values" is "list". '
                f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.')
        elif isinstance(data_schema_[enum_key_], list):
            list_values = data_schema_[enum_key_]
            for value in list_values:
                if value not in enum_keys_[enum_key_].value["allowed_values"]["criteria"]:
                    self.record_result(
                        ResultTypeStatuses.ERROR, ResultTypeReasons.BAD_VALUE, section_name_,
                        f'Value "{value}" within list "{enum_key_}" is not allowed. Check allowed values.')
        elif data_schema_[enum_key_] not in enum_keys_[enum_key_].value["allowed_values"]["criteria"]:
            self.record_result(
                ResultTypeStatuses.ERROR, ResultTypeReasons.BAD_VALUE, section_name_,
                f'"{enum_key_}" value of "{data_schema_[enum_key_]}" is not allowed. Check allowed values.')

    def verify_allowed_value_range(self, data_schema_, section_name_, enum_key_, enum_keys_):
        """
        Verifies that a key in the data schema contains an allowed value, according to the specified range at
        "allowed_values"["criteria"].
        If a key has a list of values, then each item in that list is verified against allowed values.
        If a key does not have an allowed value, an ERROR message is logged to self.results.
        :param data_schema_: Dict. Section of data schema to be validated.
        :param section_name_: String. Colon-separated trail of the schema section (e.g. 'top_level:endpoint:info').
        :param enum_key_: String. Key to be validated against enum_keys_.
        :param enum_keys_: Enum. Enum of expected keys for data schema being validated.
        :return: None.
        """

        enum_key_value = enum_keys_[enum_key_].value

        if not isinstance(enum_key_value["allowed_values"]["criteria"], dict):
            raise DataSchemaCriticalError(
                f'"criteria" of "allowed_values" must be a dictionary when "type" of "allowed_values" is "range". '
                f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.')

        else:
            range_min = enum_key_value["allowed_values"]["criteria"].get("min")
            range_max = enum_key_value["allowed_values"]["criteria"].get("max")
            if range_min is None and range_max is None:
                raise DataSchemaCriticalError(
                    f'At least one of the keys "min" or "max" must be defined when "type" of "allowed_values" is "range". '
                    f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.')
            elif range_min is not None and range_max is not None and range_min >= range_max:
                raise DataSchemaCriticalError(
                    f'"max" value must be greater than "min" value when "type" of "allowed_values" is "range". '
                    f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.')
            elif enum_key_value.get("data_type") not in (int, float, list):
                raise DataSchemaCriticalError(
                    f'{enum_key_} must be a numerical or list "data_type" when "type" of "allowed_values" is "range". '
                    f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.')
            elif isinstance(data_schema_[enum_key_], list):
                if enum_key_value.get("list_value_opts", {}).get("data_type") not in (int, float):
                    raise DataSchemaCriticalError(
                        f'{enum_key_} must be a numerical "data_type" in "list_value_opts" when "type" of "allowed_values"'
                        f' is "range". Check Enum {enum_keys_[enum_key_]} and correct before validating again.')
                else:
                    list_values = data_schema_[enum_key_]
                    for value in list_values:
                        if range_min is not None and value < range_min:
                            self.record_result(
                                ResultTypeStatuses.ERROR, ResultTypeReasons.BAD_VALUE, section_name_,
                                f'Value "{value}" within list "{enum_key_}" '
                                f'is below the minimum of the allowed range. Check allowed value range.')
                        elif range_max is not None and value > range_max:
                            self.record_result(
                                ResultTypeStatuses.ERROR, ResultTypeReasons.BAD_VALUE, section_name_,
                                f'Value "{value}" within list "{enum_key_}" '
                                f'is above the maximum of the allowed range. Check allowed value range.')
            elif range_min is not None and data_schema_[enum_key_] < range_min:
                self.record_result(
                    ResultTypeStatuses.ERROR, ResultTypeReasons.BAD_VALUE, section_name_,
                    f'"{enum_key_}" value of "{data_schema_[enum_key_]}" is below the minimum of the allowed range. '
                    f'Check allowed value range.')
            elif range_max is not None and data_schema_[enum_key_] > range_max:
                self.record_result(
                    ResultTypeStatuses.ERROR, ResultTypeReasons.BAD_VALUE, section_name_,
                    f'"{enum_key_}" value of "{data_schema_[enum_key_]}" is above the maximum of the allowed range. '
                    f'Check allowed value range.')

    def verify_allowed_value_regex(self, data_schema_, section_name_, enum_key_, enum_keys_):
        """
        Verifies that a key in the data schema contains an allowed value, according to the specified regex pattern at
        "allowed_values"["criteria"].
        If a key has a list of values, then each item in that list is verified against allowed values.
        If a key does not have an allowed value, an ERROR message is logged to self.results.
        :param data_schema_: Dict. Section of data schema to be validated.
        :param section_name_: String. Colon-separated trail of the schema section (e.g. 'top_level:endpoint:info').
        :param enum_key_: String. Key to be validated against enum_keys_.
        :param enum_keys_: Enum. Enum of expected keys for data schema being validated.
        :return: None.
        """

        enum_key_value = enum_keys_[enum_key_].value

        if not isinstance(enum_key_value["allowed_values"]["criteria"], str):
            raise DataSchemaCriticalError(
                f'"criteria" of "allowed_values" must be a string when "type" of "allowed_values" is "regex". '
                f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.')
        elif enum_key_value.get("data_type") not in (str, list):
            raise DataSchemaCriticalError(
                f'{enum_keys_[enum_key_]} must be a string or list "data_type" when "type" of "allowed_values" is "regex". '
                f'Check Enum {enum_keys_[enum_key_]} and correct before validating again.')
        elif isinstance(data_schema_[enum_key_], list):
            if enum_key_value.get("list_value_opts", {}).get("data_type") != str:
                raise DataSchemaCriticalError(
                    f'{enum_key_} must be a string "data_type" in "list_value_opts" when "type" of "allowed_values"'
                    f' is "regex". Check Enum {enum_keys_[enum_key_]} and correct before validating again.')
            else:
                list_values = data_schema_[enum_key_]
                for value in list_values:
                    if not re.search(enum_key_value["allowed_values"].get("criteria"), value):
                        self.record_result(
                            ResultTypeStatuses.ERROR, ResultTypeReasons.BAD_VALUE, section_name_,
                            f'Value "{value}" within list "{enum_key_}" '
                            f'does not match the given regex pattern. Check allowed regex pattern.')
        elif not re.search(enum_key_value["allowed_values"].get("criteria"), data_schema_[enum_key_]):
            self.record_result(
                ResultTypeStatuses.ERROR, ResultTypeReasons.BAD_VALUE, section_name_,
                f'"{enum_key_}" value of "{data_schema_[enum_key_]}" does not match the given regex pattern. '
                f'Check allowed regex pattern.')

    def catch_nonexistent_keys(self, data_schema_, section_name_, enum_keys_):
        """
        Finds keys that are in the data schema but not part of the schema structure and logs an ERROR message.
        :param data_schema_: Dict. Section of data schema to be validated.
        :param section_name_: String. Colon-separated trail of the schema section (e.g. 'top_level:endpoint:info').
        :param enum_keys_: Enum. Enum of expected keys for data schema being validated.
        :return: None.
        """

        for key in data_schema_:
            if key not in enum_keys_:
                self.record_result(ResultTypeStatuses.ERROR, ResultTypeReasons.UNKNOWN_KEY, section_name_,
                                   f'"{key}" is an unknown key for this section.')

    def verify_field_no_duplicate_names(self, field_names: list):
        """
        Checks for duplicate field names (case-insensitive). If a duplicate is found, a WARNING is logged.
        :param field_names: List. All field names.
        :return: None.
        """

        seen = set()
        for name in field_names:
            if name.upper() not in seen:
                seen.add(name.upper())
            else:
                self.record_result(ResultTypeStatuses.WARNING, ResultTypeReasons.DUPLICATE_VALUE,
                                   f"top_level:fields[{name}]", 'Duplicate name in field list.')

    def verify_field_merge_key(self, field_: dict):
        """
        Validates fields: if a field is set as a merge key, flags it with a WARNING if no default value is set.
        :param field_: Dict. The field attribute dictionary.
        :return: None.
        """

        if field_.get("sf_merge_key"):
            if not field_.get("default_value"):
                field_name = field_.get("name")
                self.record_result(ResultTypeStatuses.WARNING, ResultTypeReasons.MISSING_KEY,
                                   f"top_level:fields[{field_name}]",
                                   '"default_value" should be set when "sf_merge_key" is True.')

    def capture_field_name(self, field_: dict):
        """
        Returns the field's name value; if "rename" has been set, that value is returned instead.
        :param field_: Dict. The field attribute dictionary.
        :return: String. The field name.
        """

        if field_.get("rename"):
            return field_["rename"]
        else:
            return field_["name"]

    def verify_fields(self):
        """
        Validates field keys/attributes. A per-field sketch follows this method.
        :return: None.
        """
        if self.data_schema.get("fields"):  # Guard mainly so the current test script is not broken
            fields = self.data_schema.get("fields")

            # Define any collections needed when going through all fields
            field_names = []

            # Iterate over the fields once, populating any collections that the checks
            # spanning all fields will need, and running per-field checks along the way
            for field in fields:
                field_name = self.capture_field_name(field)
                field_names.append(field_name)
                if field.get("is_pii") == True:
                    self.fields_with_pii.append(field.get("name"))

                # Checks on single fields that don't require going through all fields first
                self.verify_field_merge_key(field)

            # Run any collection checks
            self.verify_field_no_duplicate_names(field_names)
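
    # A sketch of a single "fields" entry as these field checks expect it (the key
    # values shown are hypothetical):
    #
    #   {"name": "EMAIL", "rename": "EMAIL_ADDRESS", "is_pii": True,
    #    "sf_merge_key": True, "default_value": ""}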

    def verify_pii_handling(self):
        """
        Validates that "contains_pii" is set correctly to True or False according to whether there is at least one
        field in the "fields" section that has "is_pii" set to True. Also checks whether "table_name" ends with
        "PII" or "NO_PII" if the endpoint is a Snowflake sink.
        :return: None.
        """

        if self.data_schema.get("fields"):
            contains_pii = self.data_schema.get("contains_pii")
            endpoints = self.data_schema.get("endpoints")

            if contains_pii == False and len(self.fields_with_pii) > 0:
                self.record_result(
                    ResultTypeStatuses.ERROR, ResultTypeReasons.CONFLICT, "top_level",
                    f'"contains_pii" is False but the following fields have the key "is_pii" set to True: {self.fields_with_pii}.'
                )

            elif contains_pii == True and len(self.fields_with_pii) == 0:
                self.record_result(ResultTypeStatuses.ERROR, ResultTypeReasons.CONFLICT, "top_level",
                                   '"contains_pii" is True but no fields have the key "is_pii" set to True.')

            for endpoint in endpoints:
                if endpoint["type"] == "sink":
                    if endpoint["info"]["type"] == "snowflake":
                        endpoint_tag = endpoint["tag"]
                        table_name = endpoint["info"]["access"]["table_name"]
                        ends_with_pii = re.search("(?<!_no)_pii$", table_name, re.IGNORECASE)
                        ends_with_no_pii = re.search("NO_PII$", table_name, re.IGNORECASE)
                        if contains_pii == True and not ends_with_pii:
                            self.record_result(
                                ResultTypeStatuses.WARNING, ResultTypeReasons.BAD_VALUE,
                                f"top_level:endpoints[{endpoint_tag}]:info:access:table_name",
                                f'Snowflake sink table name "{table_name}" should end with "_PII" when "contains_pii" is True.'
                            )
                        elif contains_pii == False and not ends_with_no_pii:
                            self.record_result(
                                ResultTypeStatuses.WARNING, ResultTypeReasons.BAD_VALUE,
                                f"top_level:endpoints[{endpoint_tag}]:info:access:table_name",
                                f'Snowflake sink table name "{table_name}" should end with "NO_PII" when "contains_pii" is False.'
                            )
                        elif not ends_with_pii and not ends_with_no_pii:
                            self.record_result(
                                ResultTypeStatuses.WARNING, ResultTypeReasons.BAD_VALUE,
                                f"top_level:endpoints[{endpoint_tag}]:info:access:table_name",
                                f'Snowflake sink table name "{table_name}" should end with either "_PII" or "NO_PII".')

    def check_for_param_conflicts(self):
        """
        Validates that there are no parameter value conflicts. If one is found, an ERROR is logged to the result
        records.
        :return: None.
        """

        if self.validation_type == "full":
            endpoints = self.data_schema["endpoints"]
        elif self.validation_type == "endpoint":
            endpoints = [self.data_schema]

        # Checking at the endpoint level
        for endpoint in endpoints:
            endpoint_tag = endpoint.get("tag")

            # Ensure that if the snowflake logger is used, write_filename_to_db is True
            if endpoint.get("info", {}).get("logger", {}).get("type") == "snowflake" \
                    and endpoint.get("info", {}).get("type") in ["snowflake", "dynamodb"] \
                    and not endpoint.get("info", {}).get("opts", {}).get("write_filename_to_db"):
                self.record_result(ResultTypeStatuses.ERROR, ResultTypeReasons.CONFLICT,
                                   f"top_level:endpoints[{endpoint_tag}]",
                                   '"write_filename_to_db" must be set to True when the snowflake logger is used.')

            # Ensure that if file_name_params is used, file_name is also set
            if endpoint.get("info", {}).get("file_info", {}).get("opts", {}).get("file_name_params") \
                    and not endpoint.get("info", {}).get("file_info", {}).get("opts", {}).get("file_name"):
                self.record_result(ResultTypeStatuses.ERROR, ResultTypeReasons.CONFLICT,
                                   f"top_level:endpoints[{endpoint_tag}]",
                                   '"file_name" must be set when "file_name_params" is used.')

            # Ensure process_by_block and the logger are not used simultaneously
            if endpoint.get("info", {}).get("opts", {}).get("process_by_block", {}) \
                    and 'logger' in str(self.data_schema):
                self.record_result(ResultTypeStatuses.ERROR, ResultTypeReasons.CONFLICT,
                                   "top_level",
                                   '"process_by_block" and "logger" cannot be used simultaneously.')

    def verify_key_mapping(self, section_name_, section_type_, enum_key_, enum_keys_):
        """
        Verifies a key's inclusion in an Enum mapping.
        :param section_name_: String. Colon-separated trail of the schema section (e.g. 'top_level:endpoint:info').
        :param section_type_: String. Typing for section (e.g. "s3", "csv").
        :param enum_key_: String. Key to be validated against enum_keys_.
        :param enum_keys_: Enum. Enum of expected keys for data schema being validated.
        :return:
            On successful key mapping: Enum. Enum of expected keys for data schema being validated.
            On failure: None.
        """

        try:
            if "key_map" in enum_keys_[enum_key_].value:
                keys = enum_keys_[enum_key_].value["key_map"][section_type_].value.__members__
            else:
                keys = enum_keys_[enum_key_].value["keys"].__members__
            return keys

        except Exception:
            self.record_result(ResultTypeStatuses.ERROR, ResultTypeReasons.BAD_PARENT_TYPE,
                               f'{section_name_}:{enum_keys_[enum_key_].name}',
                               'Section could not be validated due to a bad parent section type. Check allowed values.')
            return None

    def validate_section(self, data_schema_, section_name_, enum_keys_, section_type_=None, endpoint_typing_=None):
        """
        Recursively validates a data schema or a section of a data schema, ensuring:
            - Required keys are present.
            - Optional keys that are missing, but have a default value in the Enums, are added.
            - Keys have the correct data type.
            - Keys have an allowed value (if allowed values are set).
            - Items that do not exist in the Enums are logged with an ERROR.
        Results of schema validation will be set to self.results.

        :param data_schema_: Dict. Section of data schema to be validated.
        :param section_name_: String. Colon-separated trail of the schema section (e.g. 'top_level:endpoint:info').
        :param enum_keys_: Enum. Enum of expected keys for data schema being validated.
        :param section_type_: String. Typing for section (e.g. "s3", "csv").
        :param endpoint_typing_: String. Pipe-delimited typing for section, composed of endpoint type/destination type
            (e.g. "sink|snowflake", "source|api").
        :return: None.
        """

        section_type = data_schema_.get("type") or section_type_
        if section_name_ == "endpoint":
            endpoint_typing = f'{data_schema_.get("type")}|{data_schema_.get("info").get("type")}'
        else:
            endpoint_typing = endpoint_typing_

        for enum_key in enum_keys_:
            if self.verify_requirement(data_schema_, section_name_, enum_key, enum_keys_, endpoint_typing):
                self.verify_data_type(data_schema_[enum_key], section_name_, enum_key, enum_keys_)

                if enum_keys_[enum_key].value.get("data_type") == list and isinstance(data_schema_[enum_key], list):
                    if not enum_keys_[enum_key].value.get("list_value_opts", {}).get("allow_duplicates", False):
                        self.check_list_for_duplicates(data_schema_[enum_key], section_name_, enum_key)
                    if not enum_keys_[enum_key].value.get("list_value_opts", {}).get("allow_empty_list", False):
                        self.verify_list_not_empty(data_schema_[enum_key], section_name_, enum_key)

                if enum_keys_[enum_key].value.get("allowed_values"):
                    self.verify_allowed_values_params(enum_key, enum_keys_)
                    if enum_keys_[enum_key].value["allowed_values"].get("type") == "list":
                        self.verify_allowed_values_list(data_schema_, section_name_, enum_key, enum_keys_)
                    elif enum_keys_[enum_key].value["allowed_values"].get("type") == "range":
                        self.verify_allowed_value_range(data_schema_, section_name_, enum_key, enum_keys_)
                    elif enum_keys_[enum_key].value["allowed_values"].get("type") == "regex":
                        self.verify_allowed_value_regex(data_schema_, section_name_, enum_key, enum_keys_)

                if "key_map" in enum_keys_[enum_key].value or "keys" in enum_keys_[enum_key].value:
                    keys = self.verify_key_mapping(section_name_, section_type, enum_key, enum_keys_)
                    # Check if verify_key_mapping failed
                    if keys is None:
                        continue
                    # Check here whether the enum has no keys; if none, then any schema keys passed into this
                    # section will log an error
                    if len(keys) == 0 and len(data_schema_[enum_key]) > 0:
                        self.catch_nonexistent_keys(data_schema_[enum_key], f'{section_name_}:{enum_key}', keys)
                        continue
                    if isinstance(data_schema_[enum_key], list):
                        for no, item in enumerate(data_schema_[enum_key]):
                            # Get endpoint-specific information for required/optional processing
                            if enum_key in ["fields"]:
                                tag = data_schema_[enum_key][no].get("name")
                            else:
                                tag = data_schema_[enum_key][no].get("tag")
                            if enum_key == "endpoints":
                                if not data_schema_[enum_key][no].get("type"):
                                    self.record_result(ResultTypeStatuses.ERROR, ResultTypeReasons.MISSING_KEY,
                                                       f"{section_name_}:{enum_key}[{tag}]",
                                                       '"type" is missing and is mandatory for an endpoint.')
                                    continue
                                endpoint_typing = f'{data_schema_[enum_key][no]["type"]}|{data_schema_[enum_key][no]["info"]["type"]}'
                            elif enum_key in ["file_filters"]:
                                endpoint_typing = endpoint_typing_

                            self.validate_section(item, f"{section_name_}:{enum_key}[{tag}]", keys, section_type,
                                                  endpoint_typing)

                    if isinstance(data_schema_[enum_key], dict):
                        self.validate_section(data_schema_[enum_key], f"{section_name_}:{enum_key}", keys,
                                              section_type, endpoint_typing)

        self.catch_nonexistent_keys(data_schema_, section_name_, enum_keys_)
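
    # As validate_section() recurses, section_name_ grows into a colon-separated trail
    # with list items indexed by tag or name, e.g. (the tag here is hypothetical):
    # 'top_level:endpoints[my-sink]:info:access'.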

    def finish_validation(self):
        """
        Trigger to finish validation: prints out the results (if enabled) and returns the new data schema.
        :return: If there are no errors, a new data schema is returned.
            If there are errors, DataSchemaFailedValidation is raised and nothing is returned.
        """

        if self.print_results:
            self.print_validation_results()

        if self.ttl_errors > 0:
            raise DataSchemaFailedValidation(self.ttl_errors)
        else:
            formatted_schema = OrderedDict()
            if self.validation_type == "full":
                keys = SchemaEnums.TopLevelKeys.__members__
            elif self.validation_type == "endpoint":
                keys = SchemaEnums.EndpointsKeys.__members__
            for key in keys:
                if key in self.data_schema:
                    formatted_schema[key] = self.data_schema[key]
            return dict(formatted_schema)

    def validate_schema(self, data_schema_, validation_type_="full", print_results_=True):
        """
        Wrapper for validate_section() which starts with a data schema and prints out the validation results
        (from self.results). The updated schema will be found at self.data_schema.
        An exception is raised if an ERROR is detected. WARNING and UPDATE logs do not cause an exception.
        :param data_schema_: Dict. Data schema to be validated.
        :param validation_type_: String. Type of validation to run. Options:
            - "full": An entire data schema is being passed in for validation. (Default)
            - "endpoint": A single endpoint section is being passed in for validation.
        :param print_results_: Boolean. If True, prints the validation results to the console. (Default: True)
        :return: Dict. Updated data schema.
        """

        self.data_schema = data_schema_
        self.validation_type = validation_type_
        self.print_results = print_results_

        if self.validation_type == "full":
            self.schema_name = f'{self.data_schema["namespace"]}-{self.data_schema["name"]}'
            self.verify_tag_uniqueness()
            self.validate_section(self.data_schema, "top_level", SchemaEnums.TopLevelKeys.__members__, None, None)
            self.check_for_param_conflicts()
            self.verify_fields()
            self.verify_pii_handling()
        elif self.validation_type == "endpoint":
            self.schema_name = f'{self.data_schema["tag"]}'
            self.validate_section(self.data_schema, "endpoint", SchemaEnums.EndpointsKeys.__members__, None, None)
            self.check_for_param_conflicts()
        else:
            raise Exception(f'"{self.validation_type}" validation_type_ is not valid for validate_schema.')

        validated_schema = self.finish_validation()
        return validated_schema
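

# A minimal usage sketch (hypothetical; a real schema dict must follow the key
# structure defined in SchemaEnums.TopLevelKeys):
#
#   validator = SchemaValidation()
#   updated_schema = validator.validate_schema(
#       schema_dict, validation_type_="full", print_results_=True)
#
# On success, the updated schema (with any default values filled in) is returned,
# ordered by SchemaEnums.TopLevelKeys. Any ERROR results raise
# DataSchemaFailedValidation; WARNING and UPDATE results do not.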