Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1 

2import os 

3import sys 

4import pytest 

5sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../") 

6import sdc_etl_libs 

7from sdc_etl_libs.sdc_data_validation.data_validation import SDCDataValidation 

8from sdc_etl_libs.sdc_data_validation.data_validation_exceptions import SDCDataValidationError, SDCDataValidationSetupError 

9 

10 

def test_check_parameters(mocker):
    """
    Checks which combinations of constructor arguments SDCDataValidation accepts:
    a data schema plus an endpoint schema builds an instance, a job configuration
    raises NotImplementedError, and a missing or incomplete pair of schemas raises
    SDCDataValidationSetupError.
    """

    # connect_to_client is replaced by a stub that returns True for the whole test.
    mocker.patch(
        'sdc_etl_libs.sdc_data_validation.data_validation.SDCDataValidation.connect_to_client',
        return_value=True)

    # Snowflake sink endpoint whose validation section enables three test types.
    sink_endpoint = {
        "type": "sink",
        "tag": "SDC_sink_0",
        "info": {
            "type": "snowflake",
            "access": {
                "account": "EXAMPLE",
                "database": "DATAENGINEERING",
                "table_name": "EXAMPLE",
                "schema": "EXAMPLE",
                "credentials": {
                    "type": "aws_secrets",
                    "opts": {"name": "snowflake/service_account/airflow"},
                },
            },
            "opts": {
                "upsert": False,
                "dedupe": False,
                "style": "snowflake",
            },
            "validation": {
                "log": True,
                "access": {
                    "credentials": {
                        "type": "aws_secrets",
                        "opts": {"name": "snowflake/service_account/airflow"},
                    },
                },
                "tests": [
                    {"type": "column_unique"},
                    {"type": "column_check"},
                    {"type": "column_not_null"},
                ],
            },
        },
    }

    # Four columns; COL4 deliberately carries no constraints.
    columns = [
        {
            "name": "COL1",
            "type": {"type": "string", "logical_type": "datetime"},
            "constraints": [
                {"type": "column_not_null"},
            ],
        },
        {
            "name": "COL2",
            "type": {"type": "int"},
            "constraints": [
                {"type": "column_not_null"},
            ],
        },
        {
            "name": "COL3",
            "type": {"type": "int"},
            "constraints": [
                {"type": "column_not_null"},
                {"type": "column_unique", "opts": {"case_sensitive": True}},
            ],
        },
        {
            "name": "COL4",
            "type": {"type": "int"},
        },
    ]

    data_schema = {
        "namespace": "validation-test",
        "name": "validation-test",
        "type": "object",
        "country_code": "USA",
        "estimated_row_size": "64b",
        "estimated_row_count": 100,
        "endpoints": [sink_endpoint],
        "fields": columns,
    }
    endpoint_schema = data_schema["endpoints"][0]

    # A data schema together with an endpoint schema must yield a truthy instance.
    validator = SDCDataValidation(
        data_schema_=data_schema,
        endpoint_schema_=endpoint_schema,
        job_configuration_=None)
    assert validator

    # Passing a job configuration is expected to raise NotImplementedError
    # (per the original test, job configurations are not supported yet).
    with pytest.raises(NotImplementedError):
        SDCDataValidation(data_schema_=None, endpoint_schema_=None, job_configuration_="example")

    # Each incomplete setup must raise SDCDataValidationSetupError, checked in order:
    #   1. endpoint schema missing
    #   2. data schema missing
    #   3. neither schemas nor job configuration given
    incomplete_setups = (
        (data_schema, None),
        (None, endpoint_schema),
        (None, None),
    )
    for data_arg, endpoint_arg in incomplete_setups:
        with pytest.raises(SDCDataValidationSetupError):
            SDCDataValidation(data_schema_=data_arg, endpoint_schema_=endpoint_arg, job_configuration_=None)

111 

112 

def test_parsing_data_schema_happy_path(mocker):
    """
    Given a data schema and an endpoint schema, the field constraints and the
    endpoint's validation tests are parsed into the expected ``data_tests`` list
    on the SDCDataValidation instance.
    """

    # Snowflake sink endpoint enabling column_unique, column_check and column_not_null.
    sink_endpoint = {
        "type": "sink",
        "tag": "SDC_sink_0",
        "info": {
            "type": "snowflake",
            "access": {
                "account": "EXAMPLE",
                "database": "DATAENGINEERING",
                "table_name": "EXAMPLE",
                "schema": "EXAMPLE",
                "credentials": {
                    "type": "aws_secrets",
                    "opts": {"name": "snowflake/service_account/airflow"},
                },
            },
            "opts": {
                "upsert": False,
                "dedupe": False,
                "style": "snowflake",
            },
            "validation": {
                "log": True,
                "access": {
                    "credentials": {
                        "type": "aws_secrets",
                        "opts": {"name": "snowflake/service_account/airflow"},
                    },
                },
                "tests": [
                    {"type": "column_unique"},
                    {"type": "column_check"},
                    {"type": "column_not_null"},
                ],
            },
        },
    }

    # COL1 and COL3 both ask for uniqueness, but only COL3 supplies opts.
    columns = [
        {
            "name": "COL1",
            "type": {"type": "string", "logical_type": "datetime"},
            "constraints": [
                {"type": "column_not_null"},
                {"type": "column_unique"},
            ],
        },
        {
            "name": "COL2",
            "type": {"type": "int"},
            "constraints": [
                {"type": "column_not_null"},
            ],
        },
        {
            "name": "COL3",
            "type": {"type": "int"},
            "constraints": [
                {"type": "column_not_null"},
                {"type": "column_unique", "opts": {"case_sensitive": True}},
            ],
        },
        {
            "name": "COL4",
            "type": {"type": "int"},
        },
    ]

    data_schema = {
        "namespace": "validation-test",
        "name": "validation-test",
        "type": "object",
        "country_code": "USA",
        "estimated_row_size": "64b",
        "estimated_row_count": 100,
        "endpoints": [sink_endpoint],
        "fields": columns,
    }
    endpoint_schema = data_schema["endpoints"][0]

    # connect_to_client is replaced by a stub that returns True.
    mocker.patch(
        'sdc_etl_libs.sdc_data_validation.data_validation.SDCDataValidation.connect_to_client',
        return_value=True)

    validation = SDCDataValidation(
        data_schema_=data_schema,
        endpoint_schema_=endpoint_schema,
        job_configuration_=None)

    expected_tests = [
        # Endpoint-defined test with no column information.
        {'column_check': {}},
        # COL3's uniqueness constraint carries opts, so it stays in its own entry.
        {'column_unique': {'columns': ['COL3'], 'case_sensitive': True}},
        # COL1's plain uniqueness constraint.
        {'column_unique': {'columns': ['COL1']}},
        # All not-null constraints are grouped into a single entry.
        {'column_not_null': {'columns': ['COL1', 'COL2', 'COL3']}},
    ]
    assert validation.data_tests == expected_tests

208 

209 

def test_parsing_data_schema_quality_test_in_field_not_in_endpoint(mocker):
    """
    A constraint declared on a field (here column_not_null) must not become a data
    test when the endpoint's validation section does not list that test type.
    """

    # connect_to_client is replaced by a stub that returns True.
    mocker.patch(
        'sdc_etl_libs.sdc_data_validation.data_validation.SDCDataValidation.connect_to_client',
        return_value=True)

    # The endpoint enables only column_unique.
    sink_endpoint = {
        "type": "sink",
        "tag": "SDC_sink_0",
        "info": {
            "type": "snowflake",
            "access": {
                "account": "EXAMPLE",
                "database": "DATAENGINEERING",
                "table_name": "EXAMPLE",
                "schema": "EXAMPLE",
                "credentials": {
                    "type": "aws_secrets",
                    "opts": {"name": "snowflake/service_account/airflow"},
                },
            },
            "opts": {
                "upsert": False,
                "dedupe": False,
                "style": "snowflake",
            },
            "validation": {
                "log": True,
                "access": {
                    "credentials": {
                        "type": "aws_secrets",
                        "opts": {"name": "snowflake/service_account/airflow"},
                    },
                },
                "tests": [
                    {"type": "column_unique"},
                ],
            },
        },
    }

    # Three columns declare column_not_null; only COL1 also declares column_unique.
    columns = [
        {
            "name": "COL1",
            "type": {"type": "string", "logical_type": "datetime"},
            "constraints": [
                {"type": "column_not_null"},
                {"type": "column_unique"},
            ],
        },
        {
            "name": "COL2",
            "type": {"type": "int"},
            "constraints": [
                {"type": "column_not_null"},
            ],
        },
        {
            "name": "COL3",
            "type": {"type": "int"},
            "constraints": [
                {"type": "column_not_null"},
            ],
        },
        {
            "name": "COL4",
            "type": {"type": "int"},
        },
    ]

    data_schema = {
        "namespace": "validation-test",
        "name": "validation-test",
        "type": "object",
        "country_code": "USA",
        "estimated_row_size": "64b",
        "estimated_row_count": 100,
        "endpoints": [sink_endpoint],
        "fields": columns,
    }
    endpoint_schema = data_schema["endpoints"][0]

    validation = SDCDataValidation(
        data_schema_=data_schema,
        endpoint_schema_=endpoint_schema,
        job_configuration_=None)

    # Only the uniqueness test survives; column_not_null is absent because it is
    # not part of the endpoint's [validation][tests] section.
    expected_tests = [
        {'column_unique': {'columns': ['COL1']}},
    ]
    assert validation.data_tests == expected_tests

301 

302 

def test_execute_data_tests(mocker):
    """
    Runs execute_data_tests() restricted to QUALITY tests and checks that only the
    column_unique test class is invoked, and that the result bookkeeping on the
    SDCDataValidation instance is updated.
    """

    # The endpoint enables column_check and column_unique.
    sink_endpoint = {
        "type": "sink",
        "tag": "SDC_sink_0",
        "info": {
            "type": "snowflake",
            "access": {
                "account": "EXAMPLE",
                "database": "DATAENGINEERING",
                "table_name": "EXAMPLE",
                "schema": "EXAMPLE",
                "credentials": {
                    "type": "aws_secrets",
                    "opts": {"name": "snowflake/service_account/airflow"},
                },
            },
            "opts": {
                "upsert": False,
                "dedupe": False,
                "style": "snowflake",
            },
            "validation": {
                "log": True,
                "access": {
                    "credentials": {
                        "type": "aws_secrets",
                        "opts": {"name": "snowflake/service_account/airflow"},
                    },
                },
                "tests": [
                    {"type": "column_check"},
                    {"type": "column_unique"},
                ],
            },
        },
    }

    columns = [
        {
            "name": "COL1",
            "type": {"type": "string", "logical_type": "datetime"},
            "constraints": [
                {"type": "column_not_null"},
                {"type": "column_unique"},
            ],
        },
        {
            "name": "COL2",
            "type": {"type": "int"},
            "constraints": [
                {"type": "column_not_null"},
            ],
        },
        {
            "name": "COL3",
            "type": {"type": "int"},
            "constraints": [
                {"type": "column_not_null"},
            ],
        },
        {
            "name": "COL4",
            "type": {"type": "int"},
        },
    ]

    data_schema = {
        "namespace": "validation-test",
        "name": "validation-test",
        "type": "object",
        "country_code": "USA",
        "estimated_row_size": "64b",
        "estimated_row_count": 100,
        "endpoints": [sink_endpoint],
        "fields": columns,
    }
    endpoint_schema = data_schema["endpoints"][0]

    # connect_to_client is replaced by a stub that returns True.
    mocker.patch(
        'sdc_etl_libs.sdc_data_validation.data_validation.SDCDataValidation.connect_to_client',
        return_value=True)

    validation = SDCDataValidation(
        data_schema_=data_schema,
        endpoint_schema_=endpoint_schema,
        job_configuration_=None)
    # Put a placeholder where the database client would be stored.
    validation.database_info["database_client"] = True

    # The parsed tests: the endpoint-level column_check plus COL1's uniqueness test.
    expected_tests = [
        {'column_check': {}},
        {'column_unique': {'columns': ['COL1']}},
    ]
    assert validation.data_tests == expected_tests

    mock_test_result = [
        {
            'test_number': 1,
            'status': 'PASS',
            'reason': 'PASS',
            'test_name': 'column_unique',
            'message': '',
        },
    ]

    # Stub both test classes' run_validation_test and keep the mocks for the
    # call assertions below.
    unique_run = mocker.patch(
        'sdc_etl_libs.sdc_data_validation.validation_tests.column_unique.ColumnUnique.run_validation_test',
        return_value=mock_test_result)
    check_run = mocker.patch(
        'sdc_etl_libs.sdc_data_validation.validation_tests.column_check.ColumnCheck.run_validation_test',
        return_value=mock_test_result)

    # Execute only the QUALITY tests against the database.
    validation.execute_data_tests(
        run_against_="database",
        sdc_dataframe_=None,
        test_types_=["QUALITY"])

    # The QUALITY test class (column_unique) must have been run.
    unique_run.assert_called()

    # column_check must not have been run; the original comment labels it a
    # METADATA test, which was not among the requested test types.
    check_run.assert_not_called()

    # Exactly one result was recorded.
    assert len(validation.total_results) == 1
    # Exactly one data test was counted as completed.
    assert validation.completed_data_tests == 1