Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" Data Validation Test to check that column metadata will match up between a source and sink """ 

2 

3from sdc_etl_libs.sdc_data_validation.data_validation_test import ValidationTest 

4from sdc_etl_libs.sdc_data_validation.data_validation_test_enums import ValidationTests 

5from sdc_etl_libs.sdc_data_validation.data_validation_enums import ValidationTestTypes, ValidationRunResultStatus, ValidationRunResultReason 

6from sdc_etl_libs.database_helpers.SnowflakeDatabase import SnowflakeDatabase 

7from sdc_etl_libs.sdc_data_schema import schema_enums as SchemaEnums 

8from sdc_etl_libs.sdc_exchange_endpoints.database_endpoints.SDCColumnCleanseStylesEnums import StyleEnums 

9from sdc_etl_libs.sdc_data_validation.data_validation_exceptions import SDCDataValidationError 

10 

11 

12class ColumnCheck(ValidationTest): 

13 

14 def __init__(self, **kwargs): 

15 """ 

16 Constructor for data test. 

17 :param **kwargs: See ValidationTest base class constructor for information on this parameter. 

18 """ 

19 

20 super().__init__() 

21 

22 self.test_number = kwargs["test_number"] 

23 self.sdc_dataframe = kwargs["sdc_dataframe"] 

24 self.database_info = kwargs["database_info"] 

25 self.data_schema = kwargs["data_schema"] 

26 self.endpoint_schema = kwargs["endpoint_schema"] 

27 

28 # Set test name and extract additional details from validation_test_enums: 

29 self.validation_test_name = "column_check" 

30 self.test_type = ValidationTests[self.validation_test_name].value["test_type"] 

31 

32 # Set params for what data type tests will be run against and check for completeness: 

33 self.check_run_against_parameters() 

34 

35 # Set test-specific attributes that came over as unpacked options from parsed data test info: 

36 self.db = self.database_info["database_client"] 

37 

38 def run_validation_test(self): 

39 """ 

40 Compares data schema column information to database column information. 

41 :return: List of Dicts. Results of running data tests. 

42 """ 

43 

44 schema_cols = self.get_data_schema_column_info() 

45 database_cols = self.get_database_column_info() 

46 self.compare_column_results(schema_cols_=schema_cols, database_cols_=database_cols) 

47 

48 return self.results 

49 

50 def get_data_schema_column_info(self): 

51 """ 

52 Iterates over the data schema and grabs column name and data type information. "rename" is used for column name 

53 is present in schema. Names are also converted to the column style set in the endpoints [info][opts][style] key. 

54 :return: Dict. Key/value of data schema column name's -> column data type's. 

55 """ 

56 

57 # Get field names and data types from data schema 

58 schema_cols = {} 

59 for field in self.data_schema["fields"]: 

60 if field.get("drop_column") is True: 

61 continue 

62 else: 

63 # Apply style formatting to column name to ensure it matches to the database column name 

64 if self.database_info["database_type"] == "snowflake": 

65 col_name = SnowflakeDatabase.clean_column_name(column_name_=field.get('rename') or field['name'], 

66 rules_=StyleEnums[self.endpoint_schema["info"]["opts"][ 

67 "style"].upper()]) 

68 else: 

69 raise SDCDataValidationError(f"'{self.validation_test_name}' not setup for database type " 

70 f"'{self.database_info['database_type']}'") 

71 col_type = field["type"].get("logical_type") or field["type"]["type"] 

72 schema_cols[col_name] = col_type 

73 

74 return schema_cols 

75 

76 def get_database_column_info(self): 

77 """ 

78 Runs the necessary queries and extracts data for column names and data types. Data types given by the database 

79 are mapped back to the data type names used in a data schema for comparison (i.e. "TEXT" -> "string"). 

80 return: Dict. Key/value of database column name's -> column data type's. 

81 """ 

82 

83 # Get column names and data types from database table 

84 database_cols = {} 

85 

86 if self.database_info["database_type"] == "snowflake": 

87 # The best way to get data types is to use "SHOW COLUMNS" query. You cannot give a database/schema, so must 

88 # run "USE" queries for each first 

89 self.db.execute_query(f'USE DATABASE "{self.database_info["database_name"]}";') 

90 self.db.execute_query(f'USE SCHEMA "{self.database_info["schema_name"]}";') 

91 # "SHOW COLUMNS" will give various information on columns. There is a "data_type" column which is a JSON 

92 # field that looks like this: {"type":"FIXED","precision":38,"scale":0,"nullable":true} 

93 self.db.execute_query(f'SHOW COLUMNS IN "{self.database_info["table_name"]}";') 

94 

95 # We run this "result_scan(last_query_id())" query that will return the data from the last query ran in the 

96 # session - which will be from the SHOW COLUMNS query above. We fetch the column name + parse the JSON in 

97 # the data_type column to get the data type 

98 self.db.execute_query(""" 

99 SELECT  

100 "column_name" COL_NAME,  

101 parse_json("data_type"):type::string COL_TYPE  

102 FROM (select * from table(result_scan(last_query_id())));  

103 """, return_results_=True) 

104 

105 for rec in self.db.get_results(): 

106 col_name = rec[0] 

107 # Here we map the Snowflake datatype pulled from "SHOW COLUMNS" query to it's equivalent name in the data schema 

108 col_type = SchemaEnums.SnowflakeDataTypesToSchemaDataTypesMapper.__members__[rec[1]].value 

109 database_cols[col_name] = col_type 

110 else: 

111 raise SDCDataValidationError(f"'{self.validation_test_name}' not setup for database type " 

112 f"'{self.database_info['database_type']}'") 

113 

114 return database_cols 

115 

116 def compare_column_results(self, schema_cols_, database_cols_): 

117 """ 

118 Compares the data schema column information to database column information and logs a result record if 

119 there is a discrepancy. 

120 :param schema_cols_: Dict. Key/value of data schema column name's -> column data type's. 

121 :param database_cols_: Dict. Key/value of database column name's -> column data type's. 

122 :return: None. 

123 """ 

124 

125 # Find columns that exist in the schema but not the database 

126 columns_in_schema_not_db = set(schema_cols_) - set(database_cols_) 

127 if columns_in_schema_not_db: 

128 self.capture_result_record( 

129 test_number_=self.test_number, 

130 result_status_=ValidationRunResultStatus.FAIL, 

131 result_reason_=ValidationRunResultReason.COLUMN_NAME_MISMATCH, 

132 test_name_=self.validation_test_name, 

133 message_=f'{len(columns_in_schema_not_db):,} column(s) exist in the data schema but not in the database: ' 

134 f'{list(columns_in_schema_not_db)}' 

135 ) 

136 

137 # Find columns that exist in the database but not the schema 

138 columns_in_db_not_schema = set(database_cols_) - set(schema_cols_) 

139 if columns_in_db_not_schema: 

140 self.capture_result_record( 

141 test_number_=self.test_number, 

142 result_status_=ValidationRunResultStatus.FAIL, 

143 result_reason_=ValidationRunResultReason.COLUMN_NAME_MISMATCH, 

144 test_name_=self.validation_test_name, 

145 message_=f'{len(columns_in_db_not_schema):,} column(s) exist in the database but not in the data schema: ' 

146 f'{list(columns_in_db_not_schema)}' 

147 ) 

148 

149 # Log result message for passes 

150 if not columns_in_db_not_schema and not columns_in_schema_not_db: 

151 self.capture_result_record( 

152 test_number_=self.test_number, 

153 result_status_=ValidationRunResultStatus.PASS, 

154 result_reason_=ValidationRunResultStatus.PASS, 

155 test_name_=self.validation_test_name, 

156 message_="No column mismatches detected between data schema and database" 

157 ) 

158 else: 

159 if not columns_in_db_not_schema: 

160 self.capture_result_record( 

161 test_number_=self.test_number, 

162 result_status_=ValidationRunResultStatus.PASS, 

163 result_reason_=ValidationRunResultStatus.PASS, 

164 test_name_=self.validation_test_name, 

165 message_="All columns in the database exist in the data schema" 

166 ) 

167 elif not columns_in_schema_not_db: 

168 self.capture_result_record( 

169 test_number_=self.test_number, 

170 result_status_=ValidationRunResultStatus.PASS, 

171 result_reason_=ValidationRunResultStatus.PASS, 

172 test_name_=self.validation_test_name, 

173 message_="All columns in the data schema exist in the database" 

174 ) 

175 

176 # Find columns where the data types are not the same between the schema and database 

177 datatype_test_failures = 0 

178 for col_name, col_type in schema_cols_.items(): 

179 if col_name in database_cols_: 

180 if col_type != database_cols_[col_name]: 

181 datatype_test_failures += 1 

182 self.capture_result_record( 

183 test_number_=self.test_number, 

184 result_status_=ValidationRunResultStatus.FAIL, 

185 result_reason_=ValidationRunResultReason.DATATYPE_MISMATCH, 

186 test_name_=self.validation_test_name, 

187 message_=f'Datatype mismatch for "{col_name}". database: "{database_cols_[col_name]}", schema: "{col_type}"' 

188 ) 

189 if datatype_test_failures: 

190 pass 

191 else: 

192 self.capture_result_record( 

193 test_number_=self.test_number, 

194 result_status_=ValidationRunResultStatus.PASS, 

195 result_reason_=ValidationRunResultStatus.PASS, 

196 test_name_=self.validation_test_name, 

197 message_=f'All data types between database and data schema match' 

198 ) 

199 

200 return self.results