Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" Module with useful tools/function for generating code from a data schema """ 

2 

3import json 

4import logging 

5import re 

6 

7import requests 

8 

9from sdc_etl_libs.database_helpers.Snowflake.snowflake_ddl_helpers import SnowflakeDDLHelpers 

10from sdc_etl_libs.sdc_data_exchange.SDCDataExchangeEndpointFactory import SDCDataExchangeEndpointFactory 

11from sdc_etl_libs.sdc_data_schema import schema_enums as SchemaEnums 

12from sdc_etl_libs.sdc_data_schema import schema_exceptions as SchemaExceptions 

13from sdc_etl_libs.sdc_data_schema import schema_toolbox_enums as SchemaToolboxEnums 

14from sdc_etl_libs.sdc_data_schema.schema_validation import SchemaValidation 

15from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes 

16from sdc_etl_libs.sdc_file_helpers.SDCFileHelpers import SDCFileHelpers 

17 

18 

class SchemaToolbox:
    """Tools for generating code and extracting information from an SDC data schema."""

    @staticmethod
    def determine_dataframe_type_from_schema(data_schema_):
        """
        Determines the appropriate dataframe to use (e.g. pandas, spark) based on
        details in the TopLevel of the data schema.
        :param data_schema_: Dict. Full data schema.
        :return: Various. Type of from SDCDFTypes.
        :raises Exception: If "estimated_row_count"/"estimated_row_size" are missing
            or of the wrong type.
        """

        if "estimated_row_count" in data_schema_:
            if isinstance(data_schema_["estimated_row_count"], int):
                # FIX: originally assigned to est_row_size (names were swapped).
                est_row_count = data_schema_["estimated_row_count"]
            else:
                raise Exception('"estimated_row_count" in data schema is not of type int')
        else:
            raise Exception('"estimated_row_count" is missing in data schema and is required')

        if "estimated_row_size" in data_schema_:
            if isinstance(data_schema_["estimated_row_size"], str):
                # TODO: Add regex to ensure in the right format (ex. "40b")
                est_row_size = data_schema_["estimated_row_size"]
            else:
                raise Exception('"estimated_row_size" in data schema is not of type str')
        else:
            raise Exception('"estimated_row_size" is missing in data schema and is required')

        # TODO: Add methodology for figuring out what dataframe type to return based
        # on est_row_size and est_row_count. For now, will always return PANDAS.
        return SDCDFTypes.PANDAS

    @staticmethod
    def get_field_names_for_file(data_schema_):
        """
        Parses the schema to find the field names that would likely appear in
        a file being processed with the schema. This is useful for when files
        do not have column headers, and a list needs to be generated to pass
        to other operations.
        :param data_schema_: Dict. Full data schema.
        :return: List. Field names as strings.
        """

        # Fields flagged "add_column" or "drop_column" would not be present in
        # the original file, so they are excluded.
        return [
            field["name"]
            for field in data_schema_["fields"]
            if not (field.get("add_column") or field.get("drop_column"))
        ]

    @staticmethod
    def generate_file_output_args(data_schema_, endpoint_schema_):
        """
        Generates a dictionary of arguments at self.args from info contained in data schema and other functions,
        depending on type of endpoint being used.
        :param data_schema_: Dict. Full data schema.
        :param endpoint_schema_: Dict. Specific endpoint section data schema.
        :return: Dict. Arguments. None if the file type is not in FileEnumMapper.
        """

        args = {}
        endpoint_type = endpoint_schema_["type"]
        file_info_type = endpoint_schema_["info"]["file_info"]["type"]
        file_info_opts = endpoint_schema_["info"]["file_info"]["opts"]

        file_type = file_info_type.lower()
        if file_type not in SchemaEnums.FileEnumMapper.__members__:
            # Unknown file type: no arguments can be mapped.
            return None

        source_enums = SchemaEnums.FileEnumMapper.__members__[file_type].value["source"]
        sink_enums = SchemaEnums.FileEnumMapper.__members__[file_type].value["sink"]

        if endpoint_type == "sink":
            # Translate each recognized option key into its sink-side argument name.
            for key, value in file_info_opts.items():
                if key in sink_enums.__members__:
                    args[sink_enums[key].value] = value
            args["fieldnames_"] = SchemaToolbox.get_field_names_for_file(data_schema_)

        elif endpoint_type == "source":
            # Translate each recognized option key into its source-side argument name.
            for key, value in file_info_opts.items():
                if key in source_enums.__members__:
                    args[source_enums[key].value] = value
            # Headerless source files need explicit column names from the schema.
            if "headers" in file_info_opts and not file_info_opts["headers"]:
                args["names"] = SchemaToolbox.get_field_names_for_file(data_schema_)

        return args

    @staticmethod
    def get_schema(schema_registry_api_url_: str = None,
                   namespace_: str = None,
                   name_: str = None,
                   local_schema_path_: str = None):
        """
        Retrieves a data schema, first from the schema registry API (if a URL is
        given), falling back to a local schema file on any failure or empty result.
        :param schema_registry_api_url_: String. Schema registry API URL. Optional.
        :param namespace_: String. Schema namespace (registry lookup key). Optional.
        :param name_: String. Schema name (registry lookup key). Optional.
        :param local_schema_path_: String. Local schema path for fallback retrieval.
        :return: Dict. Data schema.
        :raises Exception: If no schema can be retrieved from either location.
        """

        if schema_registry_api_url_:
            try:
                params = {"namespace": namespace_, "name": name_}
                response = requests.request('GET', schema_registry_api_url_, params=params)
                # FIX: the original called len() on the Response object (TypeError,
                # so the registry path always silently fell back to the local file)
                # and never parsed the payload. Surface HTTP errors and decode JSON.
                response.raise_for_status()
                schema_data = response.json()
                if len(schema_data) == 0:
                    # Registry returned nothing usable; fall back to local schema.
                    schema_data = SchemaToolbox.get_data_schema_from_file(local_schema_path_)
            except Exception:
                logging.info('Failed to retrieve schema from the registry. Apply local schema...')
                try:
                    schema_data = SchemaToolbox.get_data_schema_from_file(local_schema_path_)
                except Exception as e:
                    # Breaking with an error here as we cannot proceed without a schema.
                    raise Exception(f'Failed to retrieve local schema.\n{e}')
        else:
            try:
                schema_data = SchemaToolbox.get_data_schema_from_file(local_schema_path_)
            except Exception as e:
                # Breaking with an error here as we cannot proceed without a schema.
                raise Exception(f'Failed to retrieve local schema.\n{e}')
        return schema_data

    @staticmethod
    def get_data_schema_from_file(schema_name_):
        """
        Returns JSON-formatted data schema from given file name.
        :param schema_name_: String. Name of schema file (path + name of file without extension).
        :return: Dict. Data schema.
        :raises SchemaExceptions.DataSchemaNotFound: If the schema file cannot be located.
        """

        try:
            file_name = SDCFileHelpers.get_file_path("schema", f"{schema_name_}.json")
        except Exception as e:
            raise SchemaExceptions.DataSchemaNotFound(e)

        # Context manager ensures the file handle is closed (the original
        # open(...).read() leaked it).
        with open(file_name) as schema_file:
            return json.load(schema_file)

    @staticmethod
    def get_endpoint_data_from_schema(data_schema_: dict, endpoint_tag_, validate_=False):
        """
        Returns the endpoint data from a data schema given an endpoint tag.
        :param data_schema_: Dict. Data schema to grab endpoint data from.
        :param endpoint_tag_: String. Tag of endpoint.
        :param validate_: Boolean. If True, schema validation is run and validated endpoint
            schema is returned.
        :return: Dict. Endpoint data.
        :raises SchemaExceptions.DataSchemaEndpointTagNotFound: If no endpoint carries the tag.
        """

        endpoint_data = None

        # If several endpoints share the tag, the last match wins (matches the
        # original loop, which did not break on the first hit).
        for endpoint in data_schema_["endpoints"]:
            if endpoint["tag"] == endpoint_tag_:
                endpoint_data = endpoint

        if not endpoint_data:
            raise SchemaExceptions.DataSchemaEndpointTagNotFound(f"{endpoint_tag_} tag not found in the data schema.")

        if validate_:
            validation = SchemaValidation()
            endpoint_data = validation.validate_schema(data_schema_=endpoint_data, validation_type_="endpoint")

        return endpoint_data

    @staticmethod
    def get_endpoint(schema_, endpoint_tag_, **kwargs):
        """
        Establishes connection with the specified endpoint.
        :param schema_: Dict. Data schema to grab endpoint data from.
        :param endpoint_tag_: str. Tag that identifies the schema endpoint we want to instantiate
        :return: SDC Endpoint
        """

        validation = SchemaValidation()
        validated_schema = validation.validate_schema(schema_)
        validated_source_endpoint_schema = SchemaToolbox.get_endpoint_data_from_schema(validated_schema, endpoint_tag_)
        return SDCDataExchangeEndpointFactory.get_endpoint(validated_schema, validated_source_endpoint_schema,
                                                           **kwargs)

    @staticmethod
    def get_field_names_from_schema(data_schema_: dict, include_add_column_: bool = True,
                                    include_drop_column_: bool = True, use_rename_: bool = True,
                                    return_upper_case_: bool = False):
        """
        Retrieves field names from a data schema.
        :param data_schema_: Dict. Full SDC Data Schema.
        :param include_add_column_: Boolean. If True, field with "add_column" of True will be included in return results.
            If False, field will be ignored.
        :param include_drop_column_: Boolean. If True, field with "drop_column" of True will be included in return results.
            If False, field will be ignored.
        :param use_rename_: Boolean. If True, field name will be returned as the "rename" name, if applicable. If False,
            "name" is returned.
        :param return_upper_case_: Boolean. If True, all field names will be converted to uppercase after pulling.
            FIX: documented but missing from the original signature, which made the
            upper-casing branch a guaranteed NameError. Defaults to False.
        :return: List of field names from data schema.
        """

        def add_field(field_):
            # Prefer the "rename" value when requested and present.
            if use_rename_ and field_.get('rename'):
                fields.append(field_['rename'])
            else:
                fields.append(field_['name'])

        fields = []
        for field in data_schema_["fields"]:
            # NOTE: a truthy-but-not-True "add_column"/"drop_column" value is
            # skipped entirely (the inner check is an identity test) — this
            # preserves the original behavior exactly.
            if field.get("add_column"):
                if field.get("add_column") is True and include_add_column_:
                    add_field(field)
            elif field.get("drop_column"):
                if field.get("drop_column") is True and include_drop_column_:
                    add_field(field)
            else:
                add_field(field)

        if return_upper_case_:
            fields = [name.upper() for name in fields]

        return fields

    @staticmethod
    def generate_sql_from_schema(schema_name_, sink_endpoint_tag_, generation_opts_=["NEW_TABLE"], validate_schema_=True):
        """
        Generates SQL DDL statements according to the specified generation options. Utilizes static methods from
        SnowflakeDDLHelpers.
        :param schema_name_: String. Name of schema file (path + name of file without extension).
        :param sink_endpoint_tag_: String. Tag of sink endpoint to generate SQL code for. Note that sink endpoint type
            must be of a database type.
        :param generation_opts_: List. Options specifying SQL scripts to generate. Possible choices are outlined in
            SchemaToolboxEnums.SQLGenerationOptions. Defaults to "NEW_TABLE". Full list of options below:
            "CREATE_DATABASE" = SQL for creating a database
            "CREATE_SCHEMA" = SQL for creating a schema
            "CREATE_SCHEMA_ROLES" = SQL for creating schema roles
            "CREATE_TABLE_OBJECT" = SQL for creating a table
            "CREATE_TABLE_ROLES" = SQL for creating table roles
            "CREATE_VIEW_OBJECT" = SQL for creating a view (only applicable when table has PII)
            "CREATE_VIEW_ROLES" = SQL for creating view roles (only applicable when table has PII)
            "CREATE_ALL" = SQL for creating database, schema, schema roles, table, table roles, view, and view roles
            "CREATE_ALL_DEV" = SQL for creating schema, table and view in the DATAENGINEERING database
            "CREATE_ALL_OBJECTS" = SQL for creating table and view objects
            "CREATE_ALL_ROLES" = SQL for creating schema roles, table roles, and view roles
            "CREATE_NEW_TABLE" = SQL for creating table, table roles, view, and view roles (most common option; use
                this when adding a new table to an existing setup with other tables
            "DROP_SCHEMA_ROLES" = SQL for dropping schema roles
            "DROP_TABLE_ROLES" = SQL for dropping table roles
            "DROP_VIEW_ROLES" = SQL for dropping view roles
            "DROP_ALL_ROLES" = SQL for dropping schema, table, and view roles
        :param validate_schema_: Boolean. Indicates whether the schema will be validated before SQL is generated.
            Defaults to True.
        :return: String. SQL code.
        """

        # NOTE(review): the default "NEW_TABLE" does not appear among the options
        # referenced below ("CREATE_NEW_TABLE" does) — confirm it is a member of
        # SchemaToolboxEnums.SQLGenerationOptions, else the default call raises here.
        for item in generation_opts_:
            if item not in SchemaToolboxEnums.SQLGenerationOptions.__members__:
                raise Exception(f'"{item}" in parameter "generate_opts_" is not a valid SQL generation option.')

        data_schema = SchemaToolbox.get_data_schema_from_file(schema_name_)
        endpoint_data = SchemaToolbox.get_endpoint_data_from_schema(data_schema, sink_endpoint_tag_)

        if validate_schema_:
            # Called for its side effect: raises if the schema is invalid.
            # (The original bound the result to an unused local.)
            SchemaValidation().validate_schema(data_schema)

        if endpoint_data["type"] != "sink":
            raise Exception("SQL statements can only be generated from sink endpoints.")

        if endpoint_data["info"]["type"] == "snowflake":

            database = endpoint_data["info"]["access"]["database"]
            schema = endpoint_data["info"]["access"]["schema"]
            table_name = endpoint_data["info"]["access"]["table_name"]
            view_name = f"VW_{table_name}"
            style = endpoint_data["info"]["opts"].get('style', 'snowflake')
            role = "DATAENGINEERING" if database == 'DATAENGINEERING' else "ACCOUNTADMIN"

            contains_pii = data_schema["contains_pii"]
            pii_status = "pii" if contains_pii else "no_pii"
            # "_PII" suffix that is not preceded by "_NO" (i.e. not "_NO_PII").
            table_name_ends_with_pii = re.search("(?<!_NO)_PII$", table_name, re.IGNORECASE)
            table_name_ends_with_no_pii = re.search("NO_PII$", table_name, re.IGNORECASE)

            # Enforce the naming convention that encodes PII status in the table name.
            if contains_pii and not table_name_ends_with_pii:
                raise Exception('"table_name" must end with "_PII" when key "contains_pii" is True.')
            if not contains_pii and not table_name_ends_with_no_pii:
                raise Exception('"table_name" must end with "_NO_PII" when key "contains_pii" is False.')

            columns_with_data_type = []
            column_masking_info = []
            for field in data_schema["fields"]:
                if field.get("drop_column"):
                    continue
                column_name = field.get("rename") or field.get("name")

                # "logical_type" (when present) takes precedence over "type" for
                # both the column data type and the masked-view expression.
                if field["type"].get("logical_type"):
                    column_data_type = SchemaEnums.SchemaLogicalTypeToDatabaseMapper[field["type"]["logical_type"]].value
                else:
                    column_data_type = SchemaEnums.SchemaTypeToDatabaseMapper[field["type"]["type"]].value

                if field.get("is_pii"):
                    if field["type"].get("logical_type"):
                        column_masked_value = SchemaEnums.SchemaTypeToDatabaseMaskingMapper[field["type"]["logical_type"]].value.format(column_name)
                    else:
                        column_masked_value = SchemaEnums.SchemaTypeToDatabaseMaskingMapper[field["type"]["type"]].value.format(column_name)
                else:
                    column_masked_value = None

                columns_with_data_type.append((column_name, column_data_type))
                column_masking_info.append((column_name, column_masked_value))

            generated_sql = f'\n\n{"":-^80}\n{"GENERATED SQL":-^80}\n{"":-^80}\n\n'

            # Database DDL
            if any(item in generation_opts_ for item in ["CREATE_DATABASE", "CREATE_ALL"]):
                generated_sql += SnowflakeDDLHelpers.generate_create_database_script(database)

            # Schema DDL
            if any(item in generation_opts_ for item in ["CREATE_SCHEMA", "CREATE_ALL", "CREATE_ALL_DEV"]):
                generated_sql += SnowflakeDDLHelpers.generate_create_schema_script(database, schema, role)
            if any(item in generation_opts_ for item in ["CREATE_SCHEMA_ROLES", "CREATE_ALL_ROLES", "CREATE_ALL"]):
                generated_sql += SnowflakeDDLHelpers.generate_create_schema_roles_script(database, schema)
            if any(item in generation_opts_ for item in ["DROP_SCHEMA_ROLES", "DROP_ALL_ROLES"]):
                generated_sql += SnowflakeDDLHelpers.generate_drop_schema_roles_script(database, schema)

            # Table DDL
            if any(item in generation_opts_ for item in ["CREATE_TABLE_OBJECT", "CREATE_ALL_OBJECTS", "CREATE_NEW_TABLE", "CREATE_ALL", "CREATE_ALL_DEV"]):
                generated_sql += SnowflakeDDLHelpers.generate_create_table_script(database, schema, table_name, role,
                                                                                  columns_with_data_type, style)
            if any(item in generation_opts_ for item in ["CREATE_TABLE_ROLES", "CREATE_ALL_ROLES", "CREATE_NEW_TABLE", "CREATE_ALL"]):
                generated_sql += SnowflakeDDLHelpers.generate_create_table_roles_script(database, schema, table_name,
                                                                                        pii_status)
            if any(item in generation_opts_ for item in ["DROP_TABLE_ROLES", "DROP_ALL_ROLES"]):
                generated_sql += SnowflakeDDLHelpers.generate_drop_table_roles_script(database, schema, table_name,
                                                                                      pii_status)

            # View DDL (masked views only exist for tables containing PII)
            if pii_status == "pii":
                if any(item in generation_opts_ for item in ["CREATE_VIEW_OBJECT", "CREATE_ALL_OBJECTS", "CREATE_NEW_TABLE", "CREATE_ALL", "CREATE_ALL_DEV"]):
                    generated_sql += SnowflakeDDLHelpers.generate_create_masked_view_script(database, schema, table_name,
                                                                                            view_name, role,
                                                                                            column_masking_info, style)
                if any(item in generation_opts_ for item in ["CREATE_VIEW_ROLES", "CREATE_ALL_ROLES", "CREATE_NEW_TABLE", "CREATE_ALL"]):
                    generated_sql += SnowflakeDDLHelpers.generate_create_view_roles_script(database, schema,
                                                                                           view_name)
                if any(item in generation_opts_ for item in ["DROP_VIEW_ROLES", "DROP_ALL_ROLES"]):
                    generated_sql += SnowflakeDDLHelpers.generate_drop_view_roles_script(database, schema,
                                                                                         view_name)
            return generated_sql

        else:
            # FIX: the original two-part f-string was missing a space between
            # "support" and "database", producing "supportdatabase type".
            raise Exception(f'SchemaToolbox.generate_sql_from_schema currently does not support '
                            f'database type of "{endpoint_data["info"]["type"]}"')