Coverage for libs/sdc_etl_libs/sdc_data_validation/validation_tests/column_check.py : 18%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1""" Data Validation Test to check that column metadata will match up between a source and sink """
3from sdc_etl_libs.sdc_data_validation.data_validation_test import ValidationTest
4from sdc_etl_libs.sdc_data_validation.data_validation_test_enums import ValidationTests
5from sdc_etl_libs.sdc_data_validation.data_validation_enums import ValidationTestTypes, ValidationRunResultStatus, ValidationRunResultReason
6from sdc_etl_libs.database_helpers.SnowflakeDatabase import SnowflakeDatabase
7from sdc_etl_libs.sdc_data_schema import schema_enums as SchemaEnums
8from sdc_etl_libs.sdc_exchange_endpoints.database_endpoints.SDCColumnCleanseStylesEnums import StyleEnums
9from sdc_etl_libs.sdc_data_validation.data_validation_exceptions import SDCDataValidationError
12class ColumnCheck(ValidationTest):
14 def __init__(self, **kwargs):
15 """
16 Constructor for data test.
17 :param **kwargs: See ValidationTest base class constructor for information on this parameter.
18 """
20 super().__init__()
22 self.test_number = kwargs["test_number"]
23 self.sdc_dataframe = kwargs["sdc_dataframe"]
24 self.database_info = kwargs["database_info"]
25 self.data_schema = kwargs["data_schema"]
26 self.endpoint_schema = kwargs["endpoint_schema"]
28 # Set test name and extract additional details from validation_test_enums:
29 self.validation_test_name = "column_check"
30 self.test_type = ValidationTests[self.validation_test_name].value["test_type"]
32 # Set params for what data type tests will be run against and check for completeness:
33 self.check_run_against_parameters()
35 # Set test-specific attributes that came over as unpacked options from parsed data test info:
36 self.db = self.database_info["database_client"]
38 def run_validation_test(self):
39 """
40 Compares data schema column information to database column information.
41 :return: List of Dicts. Results of running data tests.
42 """
44 schema_cols = self.get_data_schema_column_info()
45 database_cols = self.get_database_column_info()
46 self.compare_column_results(schema_cols_=schema_cols, database_cols_=database_cols)
48 return self.results
50 def get_data_schema_column_info(self):
51 """
52 Iterates over the data schema and grabs column name and data type information. "rename" is used for column name
53 is present in schema. Names are also converted to the column style set in the endpoints [info][opts][style] key.
54 :return: Dict. Key/value of data schema column name's -> column data type's.
55 """
57 # Get field names and data types from data schema
58 schema_cols = {}
59 for field in self.data_schema["fields"]:
60 if field.get("drop_column") is True:
61 continue
62 else:
63 # Apply style formatting to column name to ensure it matches to the database column name
64 if self.database_info["database_type"] == "snowflake":
65 col_name = SnowflakeDatabase.clean_column_name(column_name_=field.get('rename') or field['name'],
66 rules_=StyleEnums[self.endpoint_schema["info"]["opts"][
67 "style"].upper()])
68 else:
69 raise SDCDataValidationError(f"'{self.validation_test_name}' not setup for database type "
70 f"'{self.database_info['database_type']}'")
71 col_type = field["type"].get("logical_type") or field["type"]["type"]
72 schema_cols[col_name] = col_type
74 return schema_cols
76 def get_database_column_info(self):
77 """
78 Runs the necessary queries and extracts data for column names and data types. Data types given by the database
79 are mapped back to the data type names used in a data schema for comparison (i.e. "TEXT" -> "string").
80 return: Dict. Key/value of database column name's -> column data type's.
81 """
83 # Get column names and data types from database table
84 database_cols = {}
86 if self.database_info["database_type"] == "snowflake":
87 # The best way to get data types is to use "SHOW COLUMNS" query. You cannot give a database/schema, so must
88 # run "USE" queries for each first
89 self.db.execute_query(f'USE DATABASE "{self.database_info["database_name"]}";')
90 self.db.execute_query(f'USE SCHEMA "{self.database_info["schema_name"]}";')
91 # "SHOW COLUMNS" will give various information on columns. There is a "data_type" column which is a JSON
92 # field that looks like this: {"type":"FIXED","precision":38,"scale":0,"nullable":true}
93 self.db.execute_query(f'SHOW COLUMNS IN "{self.database_info["table_name"]}";')
95 # We run this "result_scan(last_query_id())" query that will return the data from the last query ran in the
96 # session - which will be from the SHOW COLUMNS query above. We fetch the column name + parse the JSON in
97 # the data_type column to get the data type
98 self.db.execute_query("""
99 SELECT
100 "column_name" COL_NAME,
101 parse_json("data_type"):type::string COL_TYPE
102 FROM (select * from table(result_scan(last_query_id())));
103 """, return_results_=True)
105 for rec in self.db.get_results():
106 col_name = rec[0]
107 # Here we map the Snowflake datatype pulled from "SHOW COLUMNS" query to it's equivalent name in the data schema
108 col_type = SchemaEnums.SnowflakeDataTypesToSchemaDataTypesMapper.__members__[rec[1]].value
109 database_cols[col_name] = col_type
110 else:
111 raise SDCDataValidationError(f"'{self.validation_test_name}' not setup for database type "
112 f"'{self.database_info['database_type']}'")
114 return database_cols
116 def compare_column_results(self, schema_cols_, database_cols_):
117 """
118 Compares the data schema column information to database column information and logs a result record if
119 there is a discrepancy.
120 :param schema_cols_: Dict. Key/value of data schema column name's -> column data type's.
121 :param database_cols_: Dict. Key/value of database column name's -> column data type's.
122 :return: None.
123 """
125 # Find columns that exist in the schema but not the database
126 columns_in_schema_not_db = set(schema_cols_) - set(database_cols_)
127 if columns_in_schema_not_db:
128 self.capture_result_record(
129 test_number_=self.test_number,
130 result_status_=ValidationRunResultStatus.FAIL,
131 result_reason_=ValidationRunResultReason.COLUMN_NAME_MISMATCH,
132 test_name_=self.validation_test_name,
133 message_=f'{len(columns_in_schema_not_db):,} column(s) exist in the data schema but not in the database: '
134 f'{list(columns_in_schema_not_db)}'
135 )
137 # Find columns that exist in the database but not the schema
138 columns_in_db_not_schema = set(database_cols_) - set(schema_cols_)
139 if columns_in_db_not_schema:
140 self.capture_result_record(
141 test_number_=self.test_number,
142 result_status_=ValidationRunResultStatus.FAIL,
143 result_reason_=ValidationRunResultReason.COLUMN_NAME_MISMATCH,
144 test_name_=self.validation_test_name,
145 message_=f'{len(columns_in_db_not_schema):,} column(s) exist in the database but not in the data schema: '
146 f'{list(columns_in_db_not_schema)}'
147 )
149 # Log result message for passes
150 if not columns_in_db_not_schema and not columns_in_schema_not_db:
151 self.capture_result_record(
152 test_number_=self.test_number,
153 result_status_=ValidationRunResultStatus.PASS,
154 result_reason_=ValidationRunResultStatus.PASS,
155 test_name_=self.validation_test_name,
156 message_="No column mismatches detected between data schema and database"
157 )
158 else:
159 if not columns_in_db_not_schema:
160 self.capture_result_record(
161 test_number_=self.test_number,
162 result_status_=ValidationRunResultStatus.PASS,
163 result_reason_=ValidationRunResultStatus.PASS,
164 test_name_=self.validation_test_name,
165 message_="All columns in the database exist in the data schema"
166 )
167 elif not columns_in_schema_not_db:
168 self.capture_result_record(
169 test_number_=self.test_number,
170 result_status_=ValidationRunResultStatus.PASS,
171 result_reason_=ValidationRunResultStatus.PASS,
172 test_name_=self.validation_test_name,
173 message_="All columns in the data schema exist in the database"
174 )
176 # Find columns where the data types are not the same between the schema and database
177 datatype_test_failures = 0
178 for col_name, col_type in schema_cols_.items():
179 if col_name in database_cols_:
180 if col_type != database_cols_[col_name]:
181 datatype_test_failures += 1
182 self.capture_result_record(
183 test_number_=self.test_number,
184 result_status_=ValidationRunResultStatus.FAIL,
185 result_reason_=ValidationRunResultReason.DATATYPE_MISMATCH,
186 test_name_=self.validation_test_name,
187 message_=f'Datatype mismatch for "{col_name}". database: "{database_cols_[col_name]}", schema: "{col_type}"'
188 )
189 if datatype_test_failures:
190 pass
191 else:
192 self.capture_result_record(
193 test_number_=self.test_number,
194 result_status_=ValidationRunResultStatus.PASS,
195 result_reason_=ValidationRunResultStatus.PASS,
196 test_name_=self.validation_test_name,
197 message_=f'All data types between database and data schema match'
198 )
200 return self.results