Coverage for libs/sdc_etl_libs/tests/sdc_data_validation_tests/data_validation_tests.py : 100%

Hot-keys on this page:
r, m, x, p — toggle line displays
j / k — jump to the next / previous highlighted chunk
0 (zero) — top of page
1 (one) — first highlighted chunk
import os
import sys

import pytest

# NOTE: the repo root must be on sys.path BEFORE the sdc_etl_libs imports below,
# so this sys.path hack intentionally sits between the import groups.
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../")
import sdc_etl_libs
from sdc_etl_libs.sdc_data_validation.data_validation import SDCDataValidation
from sdc_etl_libs.sdc_data_validation.data_validation_exceptions import SDCDataValidationError, SDCDataValidationSetupError
def test_check_parameters(mocker):
    """
    Testing that SDCDataValidation enforces its constructor parameter rules:
    it initializes from a data schema + endpoint schema, raises NotImplementedError
    for a job configuration, and raises SDCDataValidationSetupError when either
    schema (or everything) is missing.
    """
    # Avoid a real database connection during construction.
    mocker.patch('sdc_etl_libs.sdc_data_validation.data_validation.SDCDataValidation.connect_to_client', return_value=True)

    data_schema = {
        "namespace": "validation-test",
        "name": "validation-test",
        "type": "object",
        "country_code": "USA",
        "estimated_row_size": "64b",
        "estimated_row_count": 100,
        "endpoints": [
            {
                "type": "sink",
                "tag": "SDC_sink_0",
                "info": {
                    "type": "snowflake",
                    "access": {
                        "account": "EXAMPLE",
                        "database": "DATAENGINEERING",
                        "table_name": "EXAMPLE",
                        "schema": "EXAMPLE",
                        "credentials": {
                            "type": "aws_secrets",
                            "opts": {"name": "snowflake/service_account/airflow"}
                        }
                    },
                    "opts": {
                        "upsert": False,
                        "dedupe": False,
                        "style": "snowflake"
                    },
                    "validation": {
                        "log": True,
                        "access": {
                            "credentials": {
                                "type": "aws_secrets",
                                "opts": {"name": "snowflake/service_account/airflow"}
                            }
                        },
                        "tests": [
                            {"type": "column_unique"},
                            {"type": "column_check"},
                            {"type": "column_not_null"}
                        ]
                    }
                }
            }
        ],
        "fields": [
            {
                "name": "COL1",
                "type": {"type": "string", "logical_type": "datetime"},
                "constraints": [
                    {"type": "column_not_null"}
                ]
            },
            {
                "name": "COL2",
                "type": {"type": "int"},
                "constraints": [
                    {"type": "column_not_null"}
                ]
            },
            {
                "name": "COL3",
                "type": {"type": "int"},
                "constraints": [
                    {"type": "column_not_null"},
                    {"type": "column_unique", "opts": {"case_sensitive": True}}
                ]
            },
            {
                "name": "COL4",
                "type": {"type": "int"}
            }
        ]
    }
    endpoint_schema = data_schema["endpoints"][0]

    # Testing that class initializes with a data schema and endpoint schema
    validation = SDCDataValidation(data_schema_=data_schema, endpoint_schema_=endpoint_schema, job_configuration_=None)
    assert validation

    # Testing that class initializes with a job configuration, but, will raise NotImplementedError since it's not supported yet
    with pytest.raises(NotImplementedError):
        validation = SDCDataValidation(data_schema_=None, endpoint_schema_=None, job_configuration_="example")

    # Testing that class raises SDCDataValidationSetupError if endpoint schema is missing with schema setup
    with pytest.raises(SDCDataValidationSetupError):
        validation = SDCDataValidation(data_schema_=data_schema, endpoint_schema_=None, job_configuration_=None)

    # Testing that class raises SDCDataValidationSetupError if data schema is missing with schema setup
    with pytest.raises(SDCDataValidationSetupError):
        validation = SDCDataValidation(data_schema_=None, endpoint_schema_=endpoint_schema, job_configuration_=None)

    # Testing that class raises SDCDataValidationSetupError if schemas or job configuration is not passed
    with pytest.raises(SDCDataValidationSetupError):
        validation = SDCDataValidation(data_schema_=None, endpoint_schema_=None, job_configuration_=None)
def test_parsing_data_schema_happy_path(mocker):
    """
    Testing that, given a data schema and endpoint schema, the data tests are properly parsed and set into the
    Validator attribute.
    """
    data_schema = {
        "namespace": "validation-test",
        "name": "validation-test",
        "type": "object",
        "country_code": "USA",
        "estimated_row_size": "64b",
        "estimated_row_count": 100,
        "endpoints": [
            {
                "type": "sink",
                "tag": "SDC_sink_0",
                "info": {
                    "type": "snowflake",
                    "access": {
                        "account": "EXAMPLE",
                        "database": "DATAENGINEERING",
                        "table_name": "EXAMPLE",
                        "schema": "EXAMPLE",
                        "credentials": {
                            "type": "aws_secrets",
                            "opts": {"name": "snowflake/service_account/airflow"}
                        }
                    },
                    "opts": {
                        "upsert": False,
                        "dedupe": False,
                        "style": "snowflake"
                    },
                    "validation": {
                        "log": True,
                        "access": {
                            "credentials": {
                                "type": "aws_secrets",
                                "opts": {"name": "snowflake/service_account/airflow"}
                            }
                        },
                        "tests": [
                            {"type": "column_unique"},
                            {"type": "column_check"},
                            {"type": "column_not_null"}
                        ]
                    }
                }
            }
        ],
        "fields": [
            {
                "name": "COL1",
                "type": {"type": "string", "logical_type": "datetime"},
                "constraints": [
                    {"type": "column_not_null"},
                    {"type": "column_unique"}
                ]
            },
            {
                "name": "COL2",
                "type": {"type": "int"},
                "constraints": [
                    {"type": "column_not_null"}
                ]
            },
            {
                "name": "COL3",
                "type": {"type": "int"},
                "constraints": [
                    {"type": "column_not_null"},
                    {"type": "column_unique", "opts": {"case_sensitive": True}}
                ]
            },
            {
                "name": "COL4",
                "type": {"type": "int"}
            }
        ]
    }
    endpoint_schema = data_schema["endpoints"][0]

    # Avoid a real database connection during construction.
    mocker.patch('sdc_etl_libs.sdc_data_validation.data_validation.SDCDataValidation.connect_to_client',
                 return_value=True)

    validation = SDCDataValidation(data_schema_=data_schema, endpoint_schema_=endpoint_schema, job_configuration_=None)

    # Testing that tests are generated properly
    assert validation.data_tests == [
        {'column_check': {}},  # Endpoint-defined test generated properly
        {'column_unique': {'columns': ['COL3'], 'case_sensitive': True}},  # QUALITY test not-grouped due to opts
        {'column_unique': {'columns': ['COL1']}},  # QUALITY test generated properly
        {'column_not_null': {'columns': ['COL1', 'COL2', 'COL3']}}  # QUALITY test grouping applicable tests together
    ]
def test_parsing_data_schema_quality_test_in_field_not_in_endpoint(mocker):
    """
    Testing that, given a data schema and endpoint schema, a constraint that is in the field (i.e. column_not_null) is not
    set in a data test if it is not defined in the Endpoint validation section.
    """
    # Avoid a real database connection during construction.
    mocker.patch('sdc_etl_libs.sdc_data_validation.data_validation.SDCDataValidation.connect_to_client',
                 return_value=True)

    data_schema = {
        "namespace": "validation-test",
        "name": "validation-test",
        "type": "object",
        "country_code": "USA",
        "estimated_row_size": "64b",
        "estimated_row_count": 100,
        "endpoints": [
            {
                "type": "sink",
                "tag": "SDC_sink_0",
                "info": {
                    "type": "snowflake",
                    "access": {
                        "account": "EXAMPLE",
                        "database": "DATAENGINEERING",
                        "table_name": "EXAMPLE",
                        "schema": "EXAMPLE",
                        "credentials": {
                            "type": "aws_secrets",
                            "opts": {"name": "snowflake/service_account/airflow"}
                        }
                    },
                    "opts": {
                        "upsert": False,
                        "dedupe": False,
                        "style": "snowflake"
                    },
                    "validation": {
                        "log": True,
                        "access": {
                            "credentials": {
                                "type": "aws_secrets",
                                "opts": {"name": "snowflake/service_account/airflow"}
                            }
                        },
                        # Only column_unique is enabled at the endpoint level here;
                        # the fields' column_not_null constraints must NOT produce tests.
                        "tests": [
                            {"type": "column_unique"}
                        ]
                    }
                }
            }
        ],
        "fields": [
            {
                "name": "COL1",
                "type": {"type": "string", "logical_type": "datetime"},
                "constraints": [
                    {"type": "column_not_null"},
                    {"type": "column_unique"}
                ]
            },
            {
                "name": "COL2",
                "type": {"type": "int"},
                "constraints": [
                    {"type": "column_not_null"}
                ]
            },
            {
                "name": "COL3",
                "type": {"type": "int"},
                "constraints": [
                    {"type": "column_not_null"}
                ]
            },
            {
                "name": "COL4",
                "type": {"type": "int"}
            }
        ]
    }
    endpoint_schema = data_schema["endpoints"][0]

    validation = SDCDataValidation(data_schema_=data_schema, endpoint_schema_=endpoint_schema,
                                   job_configuration_=None)

    # Testing that tests are generated properly
    assert validation.data_tests == [
        {'column_unique': {'columns': ['COL1']}},  # QUALITY test generated properly
        # column_not_null test should not be in here since it's not part of the endpoint's [validation][tests] section
    ]
def test_execute_data_tests(mocker):
    """
    Tests that everything goes right for a data test when calling execute_data_tests()
    """
    data_schema = {
        "namespace": "validation-test",
        "name": "validation-test",
        "type": "object",
        "country_code": "USA",
        "estimated_row_size": "64b",
        "estimated_row_count": 100,
        "endpoints": [
            {
                "type": "sink",
                "tag": "SDC_sink_0",
                "info": {
                    "type": "snowflake",
                    "access": {
                        "account": "EXAMPLE",
                        "database": "DATAENGINEERING",
                        "table_name": "EXAMPLE",
                        "schema": "EXAMPLE",
                        "credentials": {
                            "type": "aws_secrets",
                            "opts": {"name": "snowflake/service_account/airflow"}
                        }
                    },
                    "opts": {
                        "upsert": False,
                        "dedupe": False,
                        "style": "snowflake"
                    },
                    "validation": {
                        "log": True,
                        "access": {
                            "credentials": {
                                "type": "aws_secrets",
                                "opts": {"name": "snowflake/service_account/airflow"}
                            }
                        },
                        "tests": [
                            {"type": "column_check"},
                            {"type": "column_unique"}
                        ]
                    }
                }
            }
        ],
        "fields": [
            {
                "name": "COL1",
                "type": {"type": "string", "logical_type": "datetime"},
                "constraints": [
                    {"type": "column_not_null"},
                    {"type": "column_unique"}
                ]
            },
            {
                "name": "COL2",
                "type": {"type": "int"},
                "constraints": [
                    {"type": "column_not_null"}
                ]
            },
            {
                "name": "COL3",
                "type": {"type": "int"},
                "constraints": [
                    {"type": "column_not_null"}
                ]
            },
            {
                "name": "COL4",
                "type": {"type": "int"}
            }
        ]
    }
    endpoint_schema = data_schema["endpoints"][0]

    # Avoid a real database connection during construction.
    mocker.patch('sdc_etl_libs.sdc_data_validation.data_validation.SDCDataValidation.connect_to_client',
                 return_value=True)

    validation = SDCDataValidation(data_schema_=data_schema, endpoint_schema_=endpoint_schema,
                                   job_configuration_=None)
    # Fake out a live client so execute_data_tests() will run against "database".
    validation.database_info["database_client"] = True

    # Testing that tests are generated properly
    assert validation.data_tests == [
        {'column_check': {}},
        {'column_unique': {'columns': ['COL1']}},  # QUALITY test generated properly
    ]

    mock_test_result = [{
        'test_number': 1,
        'status': 'PASS',
        'reason': 'PASS',
        'test_name': 'column_unique',
        'message': ''
    }]

    # Stub out both validation-test classes so no SQL is actually executed.
    mocker.patch('sdc_etl_libs.sdc_data_validation.validation_tests.column_unique.ColumnUnique.run_validation_test',
                 return_value=mock_test_result)
    mocker.patch('sdc_etl_libs.sdc_data_validation.validation_tests.column_check.ColumnCheck.run_validation_test',
                 return_value=mock_test_result)

    # Testing calling execute_data_tests on QUALITY tests
    validation.execute_data_tests(run_against_="database",
                                  sdc_dataframe_=None,
                                  test_types_=["QUALITY"])

    # Assert that the actual QUALITY data test class was called during the execution
    sdc_etl_libs.sdc_data_validation.validation_tests.column_unique.ColumnUnique.run_validation_test.assert_called()

    # Assert that the METADATA test data test class was NOT called during the execution as it was not part of execute_data_tests()
    sdc_etl_libs.sdc_data_validation.validation_tests.column_check.ColumnCheck.run_validation_test.assert_not_called()

    # Assert that total results is not None
    assert len(validation.total_results) == 1
    # Assert that completed data tests is incremented
    assert validation.completed_data_tests == 1