Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" SDC Data Validation module """ 

2 

3import logging 

4from sdc_etl_libs.database_helpers.DatabaseFactory import DatabaseFactory 

5from sdc_etl_libs.sdc_data_validation.data_validation_toolbox import DataValidationToolbox 

6from sdc_etl_libs.sdc_credentials.sdc_endpoint_credentials import SDCEndpointCredentials 

7from sdc_etl_libs.sdc_data_validation.data_validation_test_factory import DataValidationTestFactory 

8from sdc_etl_libs.sdc_data_validation.data_validation_test_enums import ValidationTests 

9from sdc_etl_libs.sdc_data_validation.data_validation_enums import ValidationTestTypes, ValidationRunResultStatus, ValidationRunResultReason 

10from sdc_etl_libs.sdc_data_validation.data_validation_exceptions import SDCDataValidationError, SDCDataValidationSetupError 

11 

12 

13class SDCDataValidation: 

14 

15 def __init__(self, data_schema_=None, endpoint_schema_=None, job_configuration_=None): 

16 """ 

17 Constructor for SDC Data Validation. Either a SDC data schema and SDC endpoint schema needs to be passed, or, an 

18 SDC job configuration. Information on how to setup the validator will be parsed from what is provided. 

19 :param data_schema_: Dict. Full SDC Data Schema. Optional. Default = None. 

20 :param endpoint_schema_: Dict. Endpoint SDC Data Schema. Optional. Default = None. 

21 :param job_configuration_: Job Configuration information. Optional. Default = None. 

22 """ 

23 

24 self.data_schema = data_schema_ 

25 self.endpoint_schema = endpoint_schema_ 

26 self.job_configuration = job_configuration_ 

27 

28 self.check_parameters() 

29 

30 self.database_connection_needed = False 

31 self.total_results = [] 

32 self.total_passes = 0 

33 self.total_failures = 0 

34 self.total_warnings = 0 

35 

36 self.recent_results = [] 

37 self.recent_passes = 0 

38 self.recent_failures = 0 

39 self.recent_warnings = 0 

40 

41 self.data_tests = None 

42 self.total_data_tests = 0 

43 self.completed_data_tests = 0 

44 self.log = None 

45 self.database_info = {} 

46 self.database_client = None 

47 

48 self.parse_setup_info_for_validator() 

49 

50 if self.database_connection_needed: 

51 self.connect_to_client() 

52 self.database_info["database_client"] = self.database_client 

53 

54 def check_parameters(self): 

55 """ 

56 Checks to ensure that the proper values were passed in to parse data validation info from. 

57 """ 

58 

59 if self.data_schema is None and self.endpoint_schema is None and self.job_configuration is None: 

60 raise SDCDataValidationSetupError("SDCDataValidation requires either a data schema and endpoint schema, " 

61 "or, a job configuration file to be passed in to parse validation specs " 

62 "from.") 

63 elif self.job_configuration is None and (self.data_schema is None or self.endpoint_schema is None): 

64 raise SDCDataValidationSetupError("SDCDataValidation appears to be using a data schema for validation " 

65 "specs, but, either the full data schema or endpoint schema is missing.") 

66 

67 def is_database_connection_needed(self): 

68 """ 

69 Scans data tests to determine if a database connection is needed to conduct any of the tests. If yes to any, 

70 then self.database_connection_needed will be set to True. 

71 """ 

72 

73 for test in self.data_tests: 

74 for test_name, _ in test.items(): 

75 requires_database_conn = ValidationTests.__members__[test_name].value.get("requires_database_conn") 

76 if self.endpoint_schema["type"] in requires_database_conn: 

77 self.database_connection_needed = True 

78 break 

79 

80 def parse_setup_info_for_validator(self): 

81 """ 

82 Extracts info related to validation from either an SDC Data Schema, or, SDC Job Configuration. 

83 :return: None. 

84 """ 

85 

86 if self.data_schema and self.endpoint_schema: 

87 

88 self.data_tests = DataValidationToolbox.convert_data_schema_to_validation_config( 

89 data_schema_=self.data_schema, endpoint_schema_=self.endpoint_schema) 

90 

91 self.is_database_connection_needed() 

92 

93 self.log = self.endpoint_schema["info"]["validation"]["log"] 

94 

95 if self.database_connection_needed: 

96 self.database_info["database_type"] = self.endpoint_schema["info"]["type"] 

97 self.database_info["database_name"] = self.endpoint_schema["info"]["access"]["database"] 

98 self.database_info["schema_name"] = self.endpoint_schema["info"]["access"]["schema"] 

99 self.database_info["table_name"] = self.endpoint_schema["info"]["access"]["table_name"] 

100 self.database_info["account"] = self.endpoint_schema["info"]["access"].get("account") 

101 self.database_info["credentials"] = self.endpoint_schema["info"]["validation"]["access"]["credentials"] 

102 

103 elif self.job_configuration: 

104 self.data_tests = DataValidationToolbox.convert_job_configuration_code_to_validation_config() 

105 raise NotImplementedError() 

106 

107 else: 

108 raise Exception("SDCDataValidation requires either data_schema_ and endpoint_schema_ " 

109 "be passed, or, job_configuration_.") 

110 

111 self.total_data_tests = len(self.data_tests) 

112 

113 def connect_to_client(self): 

114 """ 

115 Makes connection to database client to run queries against. Database client is set to class's 

116 "database_client" attribute. 

117 :return: None. 

118 """ 

119 

120 credentials = SDCEndpointCredentials.get_credentials(self.database_info["credentials"]) 

121 

122 # TODO: Need some uniformity in database connections so that we don't require an if statement here 

123 if self.database_info["database_type"] == "snowflake": 

124 self.database_client = DatabaseFactory.get_database(database_name_=self.database_info["database_type"]) 

125 self.database_client.connect( 

126 credentials["warehouse"], 

127 self.database_info["database_name"], 

128 self.database_info["schema_name"], 

129 credentials["role"], 

130 credentials["username"], 

131 credentials["password"], 

132 self.database_info["account"] 

133 ) 

134 self.database_client.execute_query("ALTER SESSION SET TIMEZONE = 'UTC';") 

135 else: 

136 raise NotImplementedError(f"Database type of '{self.database_info['database_type']}' is currently " 

137 f"not supported by SDCDataValidation") 

138 

139 def execute_data_tests(self, run_against_, test_types_, sdc_dataframe_=None, output_results_=False): 

140 """ 

141 Runs the data test against the data. 

142 :param run_against_: String. Either "database" or "dataframe" to denote the data type being tested. 

143 :param test_types_: List. Types of tests to run (from sdc_data_validation.data_validation_enums.ValidationTestTypes). 

144 Test types are assigned to a test in sdc_data_validation.validation_tests.validation_test_enums. 

145 :param sdc_dataframe_: SDCDataframe. The dataframe to check data against (if run_against_="dataframe") 

146 :param output_results_: Boolean. If True, results will be printed to the console. Default = False. 

147 :return: None. 

148 """ 

149 

150 self.recent_passes = 0 

151 self.recent_failures = 0 

152 self.recent_warnings = 0 

153 self.recent_results = [] 

154 

155 for data_test in self.data_tests: 

156 for test_name, test_opts in data_test.items(): 

157 if test_name in ValidationTests.__members__: 

158 if ValidationTests[test_name].value["test_type"] in test_types_: 

159 test_number = self.completed_data_tests + 1 

160 logging.info(f"Running data validation test '{test_name}' against '{run_against_}'. " 

161 f"(Test {test_number:,} of {self.total_data_tests:,}).") 

162 test = DataValidationTestFactory.setup_validation_test( 

163 test_number_=test_number, 

164 run_against_=run_against_, 

165 validation_test_info_=data_test, 

166 sdc_dataframe_=sdc_dataframe_, 

167 database_info_=self.database_info, 

168 data_schema_=self.data_schema, 

169 endpoint_schema_=self.endpoint_schema 

170 ) 

171 results = test.run_validation_test() 

172 self.update_results(results) 

173 else: 

174 raise SDCDataValidationUnknownTest(test_name) 

175 

176 if output_results_: 

177 self.output_validation_results(results_type_="recent") 

178 

179 def update_results(self, results_): 

180 """ 

181 Updates the overall validation results with data test results. 

182 :param results_: List of Dicts. Results from a data test. 

183 :return: None. 

184 """ 

185 

186 for result in results_: 

187 if result["status"] == ValidationRunResultStatus.FAIL.value: 

188 self.total_failures += 1 

189 self.recent_failures += 1 

190 elif result["status"] == ValidationRunResultStatus.PASS.value: 

191 self.total_passes += 1 

192 self.recent_passes += 1 

193 elif result["status"] == ValidationRunResultStatus.WARN.value: 

194 self.total_warnings += 1 

195 self.recent_warnings += 1 

196 

197 self.total_results.extend(results_) 

198 self.recent_results.extend(results_) 

199 self.completed_data_tests += 1 

200 

201 def finish_validation(self): 

202 """ 

203 Place-holder for wrap up tasks once data validation is completed. 

204 """ 

205 

206 pass 

207 

208 def output_validation_results(self, results_type_="all"): 

209 """ 

210 Outputs the results of validation to the console. 

211 :param results_type_: String. The type of result records to include in the output. Options include: 

212 - "all": All results for all test types run so far (from self.total_results). 

213 - "recent": Results for the last block of test types run (from self.recent_results). 

214 Default = "all" 

215 :return: None. 

216 """ 

217 

218 if results_type_ == "all": 

219 results = self.total_results 

220 passes = self.total_passes 

221 warnings = self.total_warnings 

222 failures = self.total_failures 

223 elif results_type_ == "recent": 

224 results = self.recent_results 

225 passes = self.recent_passes 

226 warnings = self.recent_warnings 

227 failures = self.recent_failures 

228 

229 separator = f" {'':-^150}" 

230 

231 logging.info(f" {'Data Validation Report':-^150}") 

232 logging.info(f" Data validation resulted in {passes:,} pass(es), {failures:,} failure(s) and {warnings:,} warning(s).") 

233 logging.info(separator) 

234 

235 logging.info( 

236 f" {'NO.': <{5}} " 

237 f"{'TEST NAME': <{20}} " 

238 f"{'STATUS': <{8}} " 

239 f"{'REASON': <{25}} " 

240 f"{'DETAIL'}" 

241 ) 

242 logging.info(separator) 

243 for status in ["FAIL", "WARN", "PASS"]: 

244 results = [x for x in results if x["status"] == status] 

245 if len(results) > 0: 

246 for result in results: 

247 logging.info( 

248 f' {result["test_number"]:.<{5}} ' 

249 f'{result["test_name"]:.<{20}} ' 

250 f'{result["status"]:.<{8}} ' 

251 f'{result["reason"]:.<{25}} ' 

252 f'{result["message"]}') 

253 logging.info(separator)