import ast
import io
import json
import logging
import math
import uuid
from enum import Enum
from distutils import util

import fastavro
import numpy as np
import pandas as pd

from sdc_etl_libs.api_helpers import SDCAPIExceptions
from sdc_etl_libs.database_helpers.Database import SDCDBTypes
from sdc_etl_libs.database_helpers.SnowflakeDatabase import SnowflakeDatabase
from sdc_etl_libs.sdc_data_exchange_loggers.SDCSnowflakeLogger import SDCSnowflakeLoggerEnums as LoggerEnums
from sdc_etl_libs.sdc_data_schema.schema_toolbox import SchemaToolbox
from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes, SDCDFPandasTypes, TransformationType
from sdc_etl_libs.sdc_dataframe.udfs.UDFFactory import UDFFactory
from sdc_etl_libs.sdc_exchange_endpoints.database_endpoints.SDCColumnCleanseStylesEnums import \
    StyleEnums
from sdc_etl_libs.sdc_file_helpers.TechnicalStandards.EDI.SDCEDIPandasConverter import \
    SDCEDIPandasConverter


# List of dataframe types that should not be double-quoted when generating
# SQL queries (as they are not string values).
SDCDFNonQuotableTypes = [bool, np.bool_, np.float32, np.float64, pd.Int32Dtype, pd.Int64Dtype]
SDCDFNonQuotableFields = []


class Dataframe(object):

    def __init__(self, sdcdftype_, schema_json_):
        self.df = None
        self.schema_json = None
        self.df_schema = []
        self.type = None
        self.add_columns = {}
        self.nullable_columns = {}
        self.datetime_columns = {}
        self.default_column_values = {}
        self.json_columns = {}
        self.array_columns = {}
        self.rename_columns = []
        self.columns_to_drop = []
        self.insert_query_ddl = None
        self.shape = (0, 0)
        if isinstance(schema_json_, str):
            self.schema_json = json.loads(schema_json_)
        elif isinstance(schema_json_, dict):
            self.schema_json = schema_json_
        else:
            raise Exception("Bad schema passed into dataframe.")

        if sdcdftype_ is None:
            self.type = SchemaToolbox.determine_dataframe_type_from_schema(self.schema_json)
        else:
            self.type = sdcdftype_

        self.database_table_name = None
        self.database_schema_name = None
        # Set when write_dataframe_to_database() is called.
        self.database_name = None

        # TODO: This may be leveraged for style application to columns in the schema.
        # self.sink = [x for x in self.schema_json.get('endpoints') if x.get('type') == 'sink']

        self.generate_schema()
        self.__history = []
        self.chunksize = 15000
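
    # Illustrative construction (a hedged sketch; the schema dict below is a made-up
    # minimal example, not a real schema from this repo):
    #
    #   schema = {
    #       "fields": [
    #           {"name": "id", "type": {"type": "int"}, "sf_merge_key": True},
    #           {"name": "created_at", "type": {"type": "string", "logical_type": "datetime"}},
    #       ]
    #   }
    #   sdcdf = Dataframe(SDCDFTypes.PANDAS, schema)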

    @staticmethod
    def generate_default_data_for_column(type_, row_count_, df_index_):
        """
        Generates a pandas Series of values with a certain type. Can then be
        used to update values in a dataframe.
        :param type_: Desired value type. If "datetime", will generate datetime
            values with utcnow. For other types, the name will correlate to the value
            in the enum SDCDFPandasTypes (if supported).
        :param row_count_: Int. Size to make the series.
        :param df_index_: Index of the pandas dataframe that data will be generated for (pd.Index(['2', '3'...]).
            Provide this so that when the generated data is applied to the dataframe, the index of the Series
            lines up with the Dataframe's index and values merge appropriately.
        :return: Pandas Series.
        """

        if type_ in SDCDFPandasTypes.__members__:
            if type_ == "datetime":
                return pd.Series([pd.Timestamp.utcnow().tz_localize(None).to_datetime64() for i in range(row_count_)],
                                 index=df_index_)
            elif type_ in ["null"]:
                return pd.Series([SDCDFPandasTypes[type_].value for i in range(row_count_)], index=df_index_)
            else:
                raise Exception(f"Default value for type {type_} not supported.")
        else:
            raise Exception(f"No defaults available for {type_}")
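
    # Hedged example of the helper above (values shown are illustrative):
    #
    #   idx = pd.Index([0, 1, 2])
    #   Dataframe.generate_default_data_for_column("null", 3, idx)
    #   # -> Series of three SDCDFPandasTypes["null"].value entries aligned to idx
    #   Dataframe.generate_default_data_for_column("datetime", 3, idx)
    #   # -> Series of three current-UTC datetime64 values aligned to idx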

    def generate_merge_query(self, table_name_, merge_table_name_, force_merge_=False):
        """
        Generates merge query code.
        :param table_name_: Snowflake table name to merge into.
        :param merge_table_name_: Snowflake table name to merge from.
        :param force_merge_: Boolean. If True, a merge will be performed (regardless
            of upsert==False in the schema) using all columns in the dataframe -
            except for the ETL insert datetime column - as merge keys.
        :return: String of merge query code.
        """

        def find_merge_keys(fields_):
            merge_keys = []
            for key in fields_:
                if "sf_merge_key" in key and key["sf_merge_key"] is True:
                    if "rename" in key:
                        merge_keys.append(key["rename"])
                    else:
                        merge_keys.append(key["name"])

            if len(merge_keys) == 0:
                raise Exception("No Merge Keys were found in schema. Merge keys are required for UPSERT.")
            else:
                return merge_keys

        if force_merge_:
            merge_keys = [
                item["name"]
                for item in self.schema_json["fields"]
                if item["name"] != LoggerEnums.RawTableColumns.ETL_INSERT_DATETIME_COLUMN.value
            ]
        else:
            merge_keys = find_merge_keys(self.schema_json["fields"])

        columns = []
        for field in self.schema_json["fields"]:
            # Remove columns from the merge query that we are dropping from the dataframe.
            if "drop_column" in field and field["drop_column"]:
                logging.info(f'Column {field["name"]} removed from merge query.')
            else:
                if "rename" in field:
                    columns.append(field["rename"])
                else:
                    columns.append(field["name"])

        query = SnowflakeDatabase.generate_merge_query(self.database_name, self.database_schema_name, table_name_,
                                                       merge_table_name_, columns, merge_keys)

        return query
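
    # Hedged sketch of how the merge keys are derived above (field entries are invented):
    #
    #   fields = [
    #       {"name": "ID", "sf_merge_key": True},
    #       {"name": "EMAIL", "sf_merge_key": True, "rename": "EMAIL_ADDRESS"},
    #       {"name": "NOTES"},
    #   ]
    #   # -> merge keys: ["ID", "EMAIL_ADDRESS"]; all non-dropped columns are merged.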

    def convert_columns_to_json(self, columns):
        """
        Converts columns in a JSON-like format into usable JSON. Will replace
        single-quoted keys/values with double-quotes and ensure boolean
        values are in the proper format for Snowflake JSON parsing.
        :return: None
        """

        logging.info("Converting columns to JSON...")

        for column in columns:
            try:
                self.df[column] = self.df[column].apply(lambda x: x.replace('\\n', '\\\\n').replace('\\r', '\\\\r')
                                                        if isinstance(x, str) else x)

                self.df[column] = self.df[column] \
                    if self.df[column].isnull().all() else self.df[column].str.replace(r':\s*(false|FALSE)', ': False',
                                                                                       regex=True)
                self.df[column] = self.df[column] \
                    if self.df[column].isnull().all() else self.df[column].str.replace(r':\s*(true|TRUE)', ': True',
                                                                                       regex=True)

                self.df[column] = self.df[column] \
                    if self.df[column].isnull().all() else self.df[column].str.replace(r':\s*null', ': None',
                                                                                       regex=True)

                self.df[column] = self.df[column].apply(lambda x: ast.literal_eval(x) if isinstance(x, list) or pd.notnull(
                    x) else x).apply(lambda x: json.dumps(x) if isinstance(x, list) or pd.notnull(x) else x)

            except Exception as e:
                logging.error(e)
                logging.error(f"Failed on column {column}")

        logging.info("Converting columns to JSON complete.")

    def generate_insert_query_ddl(self):
        """
        Generates the DDL portion of an insert query based off of the dataframe
        data. Wraps JSON columns into appropriate JSON formatting functions.
        Example result:
            (START_DATE,
            EMPLOYEE_NUMBER,
            EMPLOYEE_DATA)
            select
            column1 as "START_DATE",
            column2 as "EMPLOYEE_NUMBER",
            PARSE_JSON(column3) as "EMPLOYEE_DATA"
            from values
        :return: DDL portion of query string.
        """

        # The part of the query that lists all the columns to be inserted, in order.
        query_column_part = ', '.join(f'"{x}"' for x in self.df.columns)

        # The part of the query that handles the transformations of the columns
        # (if any), including adding PARSE_JSON() if JSON.
        query_select_part = ""
        count = 1
        for col in self.df.columns:
            if col in self.json_columns or col in self.array_columns:
                q = f'PARSE_JSON(Column{count}) as "{col}", '
            else:
                q = f'Column{count} as "{col}", '
            query_select_part += q
            count += 1

        query = f'({query_column_part}) select {query_select_part[:-2]} from values '

        return query

    def generate_insert_query_values(self, index_start_, index_end_, use_string_delimiter_=False):
        """
        Generates the values portion of an insert query based off of the Dataframe
        data. Ensures values that should not be quoted as strings (numbers, boolean,
        etc.) aren't, and that None/empty values in the Dataframe are converted to NULL
        for inserting into the table.
        Example result:
            ('2019-09-10', 5, {'EmployeeName': 'D. Engineer', 'EmployeeCode': 10}),
            ('2019-10-10', 6, {'EmployeeName': 'D. Analyst', 'EmployeeCode': 12}),
            ('2019-09-25', 9, {'EmployeeName': 'Mr. E', 'EmployeeCode': 15})
        :param index_start_: Starting index number of dataframe
        :param index_end_: Ending index number of dataframe
        :param use_string_delimiter_: Boolean. Indicates if $$ will be used as the string delimiter.
        :return: Values portion of query string.
        """

        # Generate a list of columns where the value should not be in quotes in
        # the insert statement. This includes numbers and boolean values.
        non_quotable_columns = []
        for index, value in self.df.dtypes.items():
            if type(value) in SDCDFNonQuotableTypes:
                non_quotable_columns.append(index)
            elif isinstance(value, np.dtype) and index in SDCDFNonQuotableFields:
                non_quotable_columns.append(index)
            # TODO: How can value == bool here?
            elif value == bool:
                non_quotable_columns.append(index)

        # The part of the query that lists the actual values to be inserted.
        # Converts any Python None types into NULL for SQL consumption.
        query_value_part = ""
        for index, row in self.df[index_start_:index_end_].iterrows():
            count = 0
            line = "("
            for rec in row:
                if count < len(self.df.columns):
                    # Convert null values to the string 'NULL' for the query.
                    if pd.isnull(rec):
                        line += "NULL, "
                    else:
                        if self.df.columns[count] in non_quotable_columns:
                            line += f"{rec}, "
                        else:
                            line += f"$${rec}$$, " if use_string_delimiter_ else f"'{rec}', "
                    count += 1
            query_value_part += line[:-2] + "), "

        return query_value_part

    def generate_complete_insert_query(self, index_start_, index_end_, table_uuid_=None, use_string_delimiter_=False):
        """
        Given the DDL and values parts of the insert query, constructs the entire
        insert query needed to write to the database.
        Example result:
            insert into "COOLDATA"."AWESOME_TABLE"
            (START_DATE,
            END_DATE,
            EMPLOYEE_DATA)
            select
            column1 as "START_DATE",
            column2 as "END_DATE",
            PARSE_JSON(column3) as "EMPLOYEE_DATA"
            from values
            ('2019-09-10', 5, {'EmployeeName': 'D. Engineer', 'EmployeeCode': 10}),
            ('2019-10-10', 6, {'EmployeeName': 'D. Analyst', 'EmployeeCode': 12}),
            ('2019-09-25', 9, {'EmployeeName': 'Mr. E', 'EmployeeCode': 15})
        :param index_start_: Starting index number of dataframe
        :param index_end_: Ending index number of dataframe
        :param table_uuid_: String. UUID name to use in generation of the temp
            table (for merging).
        :param use_string_delimiter_: Boolean. Indicates if $$ will be used as the string delimiter.
        :return: Complete insert query as string.
        """

        # Build (and cache) the DDL portion of the query if it has not been generated yet.
        if not self.insert_query_ddl:
            self.insert_query_ddl = self.generate_insert_query_ddl()
        query_ddl_part = self.insert_query_ddl

        query_value_part = self.generate_insert_query_values(index_start_, index_end_, use_string_delimiter_)

        if table_uuid_:
            table_name = self.database_table_name + f"_TEMP_{table_uuid_}"
        else:
            table_name = self.database_table_name

        complete_query = f'insert into "{self.database_schema_name}".' \
                         f'"{table_name}" {query_ddl_part} {query_value_part[:-2]};'

        return complete_query

    def get_dataframe_schema(self, type_):
        """
        :param type_: Schema type being requested (e.g. json, avro)
        """

        pass

    def generate_schema(self):
        """
        This function will generate the proper schema for the dataframe.
        :return: None
        """
        column_type = None
        if "fields" not in self.schema_json:
            raise Exception("Malformed schema")
        if self.type == SDCDFTypes.PANDAS:
            for column in self.schema_json["fields"]:
                if "name" not in column or "type" not in column or "type" not in column["type"]:
                    raise Exception("Malformed schema")
                elif "logical_type" in column["type"]:
                    if column["type"]["logical_type"] not in SDCDFPandasTypes.__members__:
                        raise Exception(f"Unknown logical_type {column['type']['logical_type']}")
                    else:
                        column_type = {
                            f"{column['name'].upper()}": SDCDFPandasTypes[column["type"]["logical_type"]].value
                        }
                        self.df_schema.append(column_type)
                else:
                    column_type = {f"{column['name'].upper()}": SDCDFPandasTypes[column["type"]["type"]].value}
                    self.df_schema.append(column_type)

                if "drop_column" in column and column["drop_column"]:
                    self.columns_to_drop.append(column["name"].upper().strip())
                    continue

                if "add_column" in column and column["add_column"]:
                    key = list(column_type.keys())[0]
                    self.add_columns[key] = column_type[key]

                if "is_nullable" in column and column["is_nullable"]:
                    key = list(column_type.keys())[0]
                    self.nullable_columns[key] = column_type[key]

                if "logical_type" in column["type"] \
                        and column["type"]["logical_type"] == "datetime":
                    key = list(column_type.keys())[0]
                    self.datetime_columns[key] = column_type[key]

                if "logical_type" in column["type"] \
                        and column["type"]["logical_type"] == 'json':
                    key, col_type = self.handle_column_key(column, column_type)
                    self.json_columns[key] = col_type

                if "logical_type" in column["type"] \
                        and column["type"]["logical_type"] == 'array':
                    key, col_type = self.handle_column_key(column, column_type)
                    self.array_columns[key] = col_type

                if "rename" in column:
                    self.rename_columns.append({column["name"].upper(): column["rename"].upper()})

                if "default_value" in column and column['default_value'] is not None:
                    key = column["rename"].upper() if "rename" in column else column["name"].upper()
                    value = column['default_value']
                    self.default_column_values[key] = value

        elif self.type == SDCDFTypes.SPARK:
            return None

        else:
            raise Exception("Unknown dataframe type")
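
    # Hedged illustration of how one schema field maps into the bookkeeping dicts
    # populated above (the field is invented for illustration):
    #
    #   {"name": "payload", "type": {"type": "string", "logical_type": "json"},
    #    "rename": "payload_json", "is_nullable": True}
    #   # -> df_schema gets {"PAYLOAD": <json pandas type>}, nullable_columns gets "PAYLOAD",
    #   #    json_columns gets "PAYLOAD_JSON", and rename_columns gets {"PAYLOAD": "PAYLOAD_JSON"}.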

    def handle_column_key(self, column, column_type):
        """
        Updates the key if the column is renamed.
        :param column_type: Df column type
        :param column: Df column
        :return: Column key and type.
        """
        key = list(column_type.keys())[0]
        col_type = column_type[key]
        if "rename" in column:
            key = column["rename"].upper()
        return key, col_type

    def process_df(self, df_):
        """
        Processes a dataframe.
        :param df_: Dataframe to process.
        :return: None.
        """
        if self.type == SDCDFTypes.PANDAS:

            def handle_datetime(dt_):
                if dt_ is None or (isinstance(dt_, str) and len(dt_) == 0):
                    return pd.NaT
                p = pd.to_datetime(dt_, errors='coerce')

                # pd.NaT never compares equal to itself, so use pd.isnull() to detect it.
                if pd.isnull(p):
                    return p
                else:
                    return p.to_datetime64()

            self.df = df_

            # TODO: Implement some logic here (or outside of Dataframe.py) to drive column styling.
            # Currently this makes all characters UPPER case, which will not work for case-sensitive
            # destinations.
            # if self.sink.get('info', {}).get('type') != 'dynamodb':
            #     style = StyleEnums['DEFAULT_DF']
            #     self.cleanse_column_names(style_=style)
            self.df.columns = [str(x).upper().strip() for x in df_.columns]

            # Replace any None or NA types with NaN.
            self.df = self.df.fillna(value=np.nan)
            # Replace empty strings with NaN.
            self.df = self.df.replace(r'^\s*$', np.nan, regex=True)

            # Performs dataframe-level transforms.
            # Need to check for aggregations vs normalizations vs modifiers.
            self.perform_dataframe_transformations(TransformationType.pre)

            # Performs column-level transforms - PRE (initial transformations).
            self.perform_column_transformations(TransformationType.pre)

            column_names = []
            for datatype in self.df_schema:
                current_col = datatype

                key = list(datatype.keys())[0]
                column_names.append(key)
                if key not in self.df.columns and key not in \
                        self.add_columns and key not in self.nullable_columns:
                    raise Exception(f"Column: {datatype} not in dataframe: {self.df.columns}")
                elif key in self.add_columns and key not in self.df.columns:
                    # TODO: @dataframeRefactor: #1 Add a condition for when the key is '_ETL_FILENAME'. We should
                    # not add empty or datetime values (as is happening right now); the value must be the name
                    # of the file that we are processing.
                    if key in self.default_column_values:
                        self.df[key] = Dataframe.generate_default_data_for_column("null", self.df.shape[0],
                                                                                  self.df.index)
                    else:
                        # TODO: @dataframeRefactor: #2 The 'type_' value sent to the method
                        # 'generate_default_data_for_column()' should be the type of the column set in the schema,
                        # not 'datetime' by default.
                        self.df[key] = Dataframe.generate_default_data_for_column("datetime", self.df.shape[0],
                                                                                  self.df.index)
                elif key in self.nullable_columns and key not in self.df.columns:
                    self.df[key] = Dataframe.generate_default_data_for_column("null", self.df.shape[0], self.df.index)
                    if datatype[key] == SDCDFPandasTypes.datetime.value:
                        self.df[key] = self.df[key].map(handle_datetime)
                    elif datatype[key] == SDCDFPandasTypes.array.value:
                        self.df[key] = self.df[key].astype(datatype)
                    else:
                        self.df[key] = self.df[key].astype(datatype)
                elif datatype[key] == SDCDFPandasTypes.datetime.value:
                    self.df[key] = self.df[key].map(handle_datetime)
                elif datatype[key] == SDCDFPandasTypes.string.value:
                    self.df[key] = self.df[key].map(lambda x: str(x) if not isinstance(x, type(np.nan)) else x)
                    self.df[key] = self.df[key].astype(datatype)
                elif datatype[key] == SDCDFPandasTypes.int.value \
                        or datatype[key] == SDCDFPandasTypes.long.value:
                    self.df[key] = pd.to_numeric(self.df[key])
                    self.df[key] = self.df[key].astype(datatype)
                elif datatype[key] == SDCDFPandasTypes.array.value:
                    self.df[key] = self.df[key].astype(datatype)
                elif datatype[key] == SDCDFPandasTypes.boolean.value:
                    self.df[key] = self.df[key].map(
                        lambda x: bool(util.strtobool(str(x))) if not isinstance(x, type(np.nan)) else x)
                    self.df[key] = self.df[key].astype(np.object_)
                    SDCDFNonQuotableFields.append(key)
                else:
                    self.df[key] = self.df[key].astype(datatype)

            for column in self.df.columns:
                if column not in column_names:
                    logging.error(f"Dropping Unknown Column: {str(column)}")
                    self.df = self.df.drop(column, axis=1)
                if column in self.columns_to_drop:
                    logging.error(f"Dropping Column: {str(column)}")
                    self.df = self.df.drop(column, axis=1)

            for column_rename in self.rename_columns:
                self.df.rename(columns=column_rename, inplace=True)

            # Performs column-level transforms - POST (final transformations).
            # TODO: Validate the right place to apply the POST transformations.
            # self.perform_dataframe_transformations(TransformationType.post)
            # self.perform_column_transformations(TransformationType.post)

            # Reset shape after all the processing.
            self.shape = self.df.shape

        elif self.type == SDCDFTypes.SPARK:
            return None

        else:
            raise Exception("Unknown dataframe type")

        for column, default_value in self.default_column_values.items():
            self.df[column] = self.df[column].fillna(value=default_value)

    def load_data(self, data_list_):
        """
        This function takes in a list of flattened dictionaries.
        Nested structures are not supported.
        :param data_list_:
        :return:
        """

        if self.type == SDCDFTypes.PANDAS:
            current_col = None

            try:
                self.df = pd.DataFrame(data_list_)
                self.process_df(self.df)
                self.shape = self.df.shape

            except Exception as e:
                logging.error(e)
                logging.error(f"Failed on column {current_col}")
                # logging.error(f"Data: {data_list_}")
                # raise Exception(f"Unable to read data with schema {self.df_schema}")
                raise SDCAPIExceptions.BadSchema(f"Unable to read data with schema {self.df_schema}")

        elif self.type == SDCDFTypes.SPARK:
            raise Exception("Spark dataframes unsupported currently.")
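
    # Hedged usage sketch for load_data (the records are invented; they must be flat
    # dicts whose keys match the schema fields):
    #
    #   sdcdf.load_data([
    #       {"id": 1, "created_at": "2019-09-10T00:00:00"},
    #       {"id": 2, "created_at": "2019-09-11T00:00:00"},
    #   ])
    #   print(sdcdf.shape)  # (2, <number of schema columns>)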

    def read_from_parquet(self, file_obj_):
        """
        This function will take a parquet file-like object and load it into a dataframe.
        Useful for when you are reading parquet files into an SDCDF.
        :param file_obj_: File object to read from.
        :return: None
        """

        if self.type == SDCDFTypes.PANDAS:
            pd_df = pd.read_parquet(file_obj_, engine='auto')
            self.process_df(pd_df)

        elif self.type == SDCDFTypes.SPARK:
            raise Exception("Spark not currently supported")
        else:
            raise Exception("Invalid dataframe type to read parquet.")

    def read_from_avro(self, file_obj_):
        """
        This function will take an avro file-like object and load it into a dataframe.
        Useful for when you are reading avro files into an SDCDF.
        :param file_obj_: File object to read from.
        :return: None
        """

        if self.type == SDCDFTypes.PANDAS:
            reader = fastavro.reader(file_obj_)
            records = [r for r in reader]
            pd_df = pd.DataFrame.from_records(records)
            self.process_df(pd_df)

        elif self.type == SDCDFTypes.SPARK:
            raise Exception("Spark not currently supported")
        else:
            raise Exception("Invalid dataframe type to read avro.")
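
    # Hedged example of reading a local Avro file into this Dataframe (the path is
    # hypothetical):
    #
    #   with open("/tmp/records.avro", "rb") as fh:
    #       sdcdf.read_from_avro(fh)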

    def sdcdf_to_sql(self, sdc_database_handle_, chunksize_=15000, table_uuid_=None, use_string_delimiter_=False):
        """
        Iterates over the dataframe in chunks and writes the data to the database.
        :param sdc_database_handle_: Database handle.
        :param chunksize_: Number of records to insert into the database at one
            time. Default = 15,000. Note: Chunksize will be reset to the
            database-specific limit if it's more than that value. Limits:
            Snowflake = 15,000
        :param table_uuid_: UUID for TEMP table generation (if upserting).
            Default = None
        :param use_string_delimiter_: Boolean. Indicates if $$ will be used as the string delimiter.
        :return: None
        """

        if self.json_columns:
            self.convert_columns_to_json(self.json_columns)

        if self.array_columns:
            self.convert_columns_to_json(self.array_columns)

        if self.type == SDCDFTypes.PANDAS:
            if sdc_database_handle_.type == SDCDBTypes.SNOWFLAKE:

                total_records = len(self.df)
                if total_records > 0:
                    total_index = total_records - 1
                    index_start = 0
                    chunksize = chunksize_ if chunksize_ < 15000 else 15000

                    while (index_start < total_index) or index_start == 0:
                        index_end = index_start + chunksize
                        logging.info(f"Attempting to insert Dataframe records "
                                     f"into Snowflake. {index_start + 1:,} to "
                                     f"{min(index_end, total_records):,} "
                                     f"(out of {total_records:,})")
                        insert_query = self.generate_complete_insert_query(index_start, index_end, table_uuid_,
                                                                           use_string_delimiter_)
                        sdc_database_handle_.execute_query(insert_query, return_results_=True)
                        index_start += chunksize

    def __send_df_to_sql(self,
                         table_name_,
                         sql_con_eng_,
                         schema_name_,
                         if_exists='append',
                         index=False,
                         chunksize=15000):
        """
        This is a wrapper function around df.to_sql which reduces the chunksize whenever
        an "out of memory" error is thrown by the SQL endpoint.
        :param table_name_: String. Name of the database table you want to write to.
        :param sql_con_eng_: SnowflakeDatabase object. Database connection/cursor object
        :param schema_name_: String. Name of the schema in the database we are writing to.
        :param if_exists: How to behave if the table already exists.
            fail: Raise a ValueError.
            replace: Drop the table before inserting new values.
            append: Insert new values to the existing table.
        :param index: Write DataFrame index as a column.
        :param chunksize: Number of rows in each batch to be written at a time.
            Default = 15,000.
        :return: None
        :raises Exception:
        """
        logging.info('Sending DF to SQL. Chunksize: %d', chunksize)
        try:
            self.df.to_sql(
                name=table_name_,
                con=sql_con_eng_,
                schema=schema_name_,
                if_exists=if_exists,
                index=index,
                chunksize=chunksize)
        except Exception as e:
            if e.orig.errno == 1042:  # 1042 is Snowflake's compilation-memory-exhausted error code.
                logging.info('%s\nCutting "chunksize" in half', e.orig.raw_msg)
                chunksize = math.ceil(chunksize / 2)
                self.__send_df_to_sql(
                    table_name_, sql_con_eng_, schema_name_, if_exists='append', index=False, chunksize=chunksize)
            else:
                raise
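
    # Hedged sketch of the retry behavior above: on Snowflake error 1042 the batch size
    # is halved and the write retried, e.g. 15000 -> 7500 -> 3750 -> ... until the batch
    # fits in memory or a different error is raised.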

    def write_dataframe_to_database(self,
                                    sdc_database_handle_,
                                    table_name_,
                                    schema_name_,
                                    upsert_=False,
                                    dedupe_=False,
                                    logger_=None,
                                    force_merge_=False,
                                    item_uuid_=None,
                                    use_string_delimiter_=False):
        """
        Writes data from an SDCDataframe to a database.

        :param sdc_database_handle_: SnowflakeDatabase object. Database connection/cursor object
        :param table_name_: String. Name of the database table you want to write to.
        :param schema_name_: String. Name of the schema in the database we are writing to.
        :param upsert_: Boolean. If True, an upsert/merge is performed on the data. If False,
            data will be inserted into the table.
        :param dedupe_: Boolean. If True, data used for an upsert/merge will first be deduped via
            a DISTINCT * query into a secondary temp table.
        :param logger_: SDCLogger object. Logger used to log ETL results.
        :param force_merge_: Boolean. If True, a merge will be performed (regardless
            of upsert==False in the schema) using all columns in the dataframe -
            except for the ETL insert datetime column - as merge keys.
        :param use_string_delimiter_: Boolean. Indicates if $$ will be used as the string delimiter.
        :return: String. Message about write results.
        """

        self.database_table_name = table_name_
        self.database_schema_name = schema_name_
        self.database_name = sdc_database_handle_.database

        r, c = self.df.shape

        if logger_:
            logger_.write_stats = {"TYPE": "snowflake", "STATS": []}
            write_stats_count = 0

        logging.info(f"Shape of dataframe: {self.df.shape}")
        if r == 0:
            raise Exception("No Data to write.")

        if self.type == SDCDFTypes.PANDAS:
            if sdc_database_handle_.type == SDCDBTypes.SNOWFLAKE:
                if not sdc_database_handle_.sqlalchemy:
                    raise Exception("Snowflake handle without SQLALCHEMY=true not supported.")

                if force_merge_:
                    upsert_ = True
                    dedupe_ = True
                    sdc_database_handle_.execute_query("ALTER SESSION SET ERROR_ON_NONDETERMINISTIC_MERGE = FALSE;")

                try:

                    if logger_:
                        sdc_database_handle_.transaction_begin()
                        base_stats_start = sdc_database_handle_.get_table_size(self.database_name,
                                                                               self.database_schema_name,
                                                                               self.database_table_name)

                    if upsert_:
                        logging.info("Attempting upsert...")
                        table_uuid = str(uuid.uuid4()).replace("-", "_")
                        temp_table_name = f"{self.database_table_name}_TEMP_{table_uuid}"

                        # Create an empty temp table with the same schema as the main table.
                        create_temp_table_clause = (
                            f"CREATE TABLE \"{self.database_schema_name}\".\"{temp_table_name}\" "
                            f"LIKE \"{self.database_schema_name}\".\"{self.database_table_name}\";")
                        sdc_database_handle_.execute_query(create_temp_table_clause)

                        # Load data into the TEMP table.
                        sdc_database_handle_.transaction_begin()
                        if self.json_columns:
                            self.sdcdf_to_sql(
                                sdc_database_handle_=sdc_database_handle_,
                                table_uuid_=table_uuid,
                                use_string_delimiter_=use_string_delimiter_)
                        else:
                            self.__send_df_to_sql(
                                f"{temp_table_name}",
                                sdc_database_handle_.connection,
                                schema_name_=self.database_schema_name,
                                if_exists='append',
                                index=False,
                                chunksize=self.chunksize)

                        # If deduping, create a deduped temp table and generate the merge query from it.
                        if dedupe_:
                            deduped_table_name = f"{temp_table_name}_DEDUPED"

                            logging.info("Generating deduped table...")
                            sdc_database_handle_.create_deduped_table(self.database_name, self.database_schema_name,
                                                                      temp_table_name, deduped_table_name)

                            merge_query = self.generate_merge_query(self.database_table_name, deduped_table_name,
                                                                    force_merge_)

                        else:
                            merge_query = self.generate_merge_query(self.database_table_name, temp_table_name,
                                                                    force_merge_)

                        if logger_:
                            if dedupe_:
                                records_to_write = sdc_database_handle_.get_table_size(
                                    self.database_name, self.database_schema_name, deduped_table_name)["RECORDS"]
                            elif upsert_:
                                records_to_write = sdc_database_handle_.get_table_size(
                                    self.database_name, self.database_schema_name, temp_table_name)["RECORDS"]
                            logger_.update_base_table({
                                "UUID": item_uuid_,
                                "TOTAL_RECORDS_WRITTEN": 0,
                                "TOTAL_RECORDS_TO_WRITE": records_to_write,
                                "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.PROCESSING.value,
                                "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SYNCING.value,
                                "EXCHANGE_MESSAGE": f"Total records to write is {records_to_write:,}"
                            })

                        # Merge results from the temp table into the main table.
                        logging.info("Merging TEMP table into main table...")
                        if logger_:
                            sdc_database_handle_.transaction_begin()
                            base_stats_start = sdc_database_handle_.get_table_size(self.database_name,
                                                                                   self.database_schema_name,
                                                                                   self.database_table_name)
                        sdc_database_handle_.execute_query(merge_query, return_results_=True)

                        # Generate results for records inserted/upserted.
                        for row in sdc_database_handle_.get_results():
                            db_inserted = row[0]
                            db_upserted = row[1]

                    else:
                        if logger_:
                            logger_.update_base_table({
                                "UUID": item_uuid_,
                                "TOTAL_RECORDS_TO_WRITE": r,
                                "CURRENT_EXCHANGE_STATUS": LoggerEnums.ExchangeStatuses.PROCESSING.value,
                                "CURRENT_EXCHANGE_REASON": LoggerEnums.ExchangeStatusReasons.SYNCING.value,
                                "EXCHANGE_MESSAGE": f"Total records to write is {r:,}"
                            })
                        logging.info("Writing to table...")
                        if self.json_columns:
                            self.sdcdf_to_sql(
                                sdc_database_handle_=sdc_database_handle_, use_string_delimiter_=use_string_delimiter_)

                        else:
                            self.__send_df_to_sql(
                                table_name_,
                                sdc_database_handle_.connection,
                                schema_name_,
                                if_exists='append',
                                index=False,
                                chunksize=self.chunksize)

                        db_inserted = r
                        db_upserted = 0

                    return f"{db_inserted:,} row(s) inserted. {db_upserted:,} row(s) upserted."

                except Exception as e:
                    logging.exception(f"There was an error trying to write the dataframe to Snowflake. {e}.")
                    raise

                finally:
                    # Log sizes of TEMP table creations and the size change to the main table.
                    if logger_:
                        logging.info("Logging table size changes...")
                        if upsert_:
                            temp_stats = sdc_database_handle_.get_table_size(self.database_name,
                                                                             self.database_schema_name,
                                                                             temp_table_name)
                            write_stats_count += 1
                            logger_.write_stats["STATS"].append({
                                "DEST_TYPE": LoggerEnums.SnowflakeWritesDestTypes.TEMP.value,
                                "OPERATION": LoggerEnums.SnowflakeWritesOperations.CREATE.value,
                                "TABLE_NAME": temp_table_name,
                                "WRITE_STEP": write_stats_count,
                                "RECORDS_INSERTED": temp_stats["RECORDS"],
                                "RECORDS_UPSERTED": 0,
                                "TOTAL_RECORDS": temp_stats["RECORDS"],
                                "INSERT_SIZE_BYTES": temp_stats["BYTES"]
                            })

                            if dedupe_:
                                dedupe_stats = sdc_database_handle_.get_table_size(self.database_name,
                                                                                   self.database_schema_name,
                                                                                   deduped_table_name)
                                write_stats_count += 1
                                logger_.write_stats["STATS"].append({
                                    "DEST_TYPE": LoggerEnums.SnowflakeWritesDestTypes.DEDUPED.value,
                                    "OPERATION": LoggerEnums.SnowflakeWritesOperations.CREATE.value,
                                    "TABLE_NAME": deduped_table_name,
                                    "WRITE_STEP": write_stats_count,
                                    "RECORDS_INSERTED": dedupe_stats["RECORDS"],
                                    "RECORDS_UPSERTED": 0,
                                    "TOTAL_RECORDS": dedupe_stats["RECORDS"],
                                    "INSERT_SIZE_BYTES": dedupe_stats["BYTES"]
                                })

                        base_stats_end = sdc_database_handle_.get_table_size(self.database_name,
                                                                             self.database_schema_name,
                                                                             self.database_table_name)
                        write_stats_count += 1
                        logger_.write_stats["STATS"].append({
                            "DEST_TYPE": LoggerEnums.SnowflakeWritesDestTypes.MAIN.value,
                            "OPERATION": LoggerEnums.SnowflakeWritesOperations.UPSERT.value
                            if dedupe_ or upsert_ else LoggerEnums.SnowflakeWritesOperations.INSERT.value,
                            "TABLE_NAME": self.database_table_name,
                            "WRITE_STEP": write_stats_count,
                            "RECORDS_INSERTED": db_inserted,
                            "RECORDS_UPSERTED": db_upserted,
                            "TOTAL_RECORDS": db_inserted + db_upserted,
                            "INSERT_SIZE_BYTES": base_stats_end["BYTES"] - base_stats_start["BYTES"]
                        })

                    if force_merge_:
                        sdc_database_handle_.execute_query(
                            "ALTER SESSION SET ERROR_ON_NONDETERMINISTIC_MERGE = TRUE;")

                    # Drop all temp tables.
                    if upsert_:
                        logging.info("Dropping TEMP table...")
                        temp_stats = sdc_database_handle_.get_table_size(self.database_name, self.database_schema_name,
                                                                         temp_table_name)
                        drop_temp_table_clause = f"DROP TABLE IF EXISTS \"{self.database_schema_name}\".\"{temp_table_name}\""
                        sdc_database_handle_.execute_query(drop_temp_table_clause, return_results_=True)
                        if logger_:
                            write_stats_count += 1
                            logger_.write_stats["STATS"].append({
                                "DEST_TYPE": LoggerEnums.SnowflakeWritesDestTypes.TEMP.value,
                                "OPERATION": LoggerEnums.SnowflakeWritesOperations.DROP.value,
                                "TABLE_NAME": temp_table_name,
                                "WRITE_STEP": write_stats_count,
                                "RECORDS_INSERTED": -temp_stats["RECORDS"],
                                "RECORDS_UPSERTED": 0,
                                "TOTAL_RECORDS": -temp_stats["RECORDS"],
                                "INSERT_SIZE_BYTES": -temp_stats["BYTES"]
                            })

                        if dedupe_:
                            logging.info("Dropping DEDUPE table...")
                            drop_deduped_table_clause = f"DROP TABLE IF EXISTS \"{self.database_schema_name}\".\"{deduped_table_name}\""
                            sdc_database_handle_.execute_query(drop_deduped_table_clause, return_results_=True)
                            if logger_:
                                write_stats_count += 1
                                logger_.write_stats["STATS"].append({
                                    "DEST_TYPE": LoggerEnums.SnowflakeWritesDestTypes.DEDUPED.value,
                                    "OPERATION": LoggerEnums.SnowflakeWritesOperations.DROP.value,
                                    "TABLE_NAME": deduped_table_name,
                                    "WRITE_STEP": write_stats_count,
                                    "RECORDS_INSERTED": -dedupe_stats["RECORDS"],
                                    "RECORDS_UPSERTED": 0,
                                    "TOTAL_RECORDS": -dedupe_stats["RECORDS"],
                                    "INSERT_SIZE_BYTES": -dedupe_stats["BYTES"]
                                })

            elif sdc_database_handle_.type == SDCDBTypes.POSTGRES:
                raise Exception("Writing to Postgres not currently supported.")
            elif sdc_database_handle_.type == SDCDBTypes.MYSQL:
                raise Exception("Writing to MYSQL not currently supported.")
            elif sdc_database_handle_.type == SDCDBTypes.NEXUS:
                raise Exception("Writing to Nexus not currently supported.")
            else:
                raise Exception(f"Writing to {sdc_database_handle_.type.value} not currently supported.")

        elif self.type == SDCDFTypes.SPARK:
            raise Exception("Spark not currently supported")
        else:
            raise Exception("Invalid dataframe type to write.")
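
    # Hedged end-to-end usage sketch (the SnowflakeDatabase handle and names below are
    # hypothetical and not defined in this module):
    #
    #   db = SnowflakeDatabase(...)          # connected with sqlalchemy=True
    #   msg = sdcdf.write_dataframe_to_database(
    #       db, table_name_="AWESOME_TABLE", schema_name_="COOLDATA", upsert_=True, dedupe_=True)
    #   logging.info(msg)                    # e.g. "1,000 row(s) inserted. 20 row(s) upserted."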

    def write_to_parquet(self, **kwargs):
        """
        This function will take a dataframe and return a file-like object.
        To get all the raw data from the file object, use: file_obj.getvalue()
        :return: File-like object.
        """

        if self.type == SDCDFTypes.PANDAS:
            buffer = io.BytesIO()
            self.df.to_parquet(buffer, engine='auto', compression='snappy')
            buffer.seek(0)
            return buffer

        elif self.type == SDCDFTypes.SPARK:
            raise Exception("Spark not currently supported")
        else:
            raise Exception("Invalid dataframe type to write.")
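
    # Hedged example: writing the dataframe out as Parquet bytes (the path is hypothetical):
    #
    #   buffer = sdcdf.write_to_parquet()
    #   with open("/tmp/output.snappy.parquet", "wb") as fh:
    #       fh.write(buffer.getvalue())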

    def write_to_csv(self, delimiter_=",", headers_=True, fieldnames_=[], **kwargs):
        """
        This function will take a dataframe and return a file-like object.
        To get all the raw data from the file object, use: file_obj.getvalue()
        :param delimiter_: File delimiter.
        :param headers_: Boolean. If true, the file has headers. If false, the file
            does not have headers.
        :param fieldnames_: List of field names (headers) of the file. If not given,
            the first line of the file will be used as the headers.
        :return: File-like object.
        """
        fieldnames_ = [x.upper() for x in fieldnames_]
        if self.type == SDCDFTypes.PANDAS:
            buffer = io.StringIO()
            self.df.to_csv(path_or_buf=buffer, index=False, sep=delimiter_, header=headers_, columns=fieldnames_)

            buffer.seek(0)
            buffer = io.BytesIO(buffer.read().encode())
            buffer.seek(0)

            return buffer

        elif self.type == SDCDFTypes.SPARK:
            raise Exception("Spark not currently supported")
        else:
            raise Exception("Invalid dataframe type to write.")

    def write_to_edi(self, **kwargs):
        """
        This function will take a dataframe and return an EDI file object.
        :return: BytesIO
        """
        logging.info("EXECUTING write_to_edi")
        if self.type == SDCDFTypes.PANDAS:
            buffer = io.BytesIO()
            edi_object = SDCEDIPandasConverter.convert_dataframe_to_edi_object(self.df)
            buffer.write(edi_object.encode())
            buffer.seek(0)
            return buffer

        elif self.type == SDCDFTypes.SPARK:
            raise Exception("Spark not currently supported")
        else:
            raise Exception("Invalid dataframe type to write.")

    def get_as_file_obj_from_endpoint_schema(self, endpoint_schema_):
        """
        This function will take a dataframe and return a file-like object.
        Note: The endpoint schema must be from the same schema that was used to
        construct the SDCDF.
        :param endpoint_schema_: JSON data schema of endpoint.
        :return: File-like object
        """
        file_type = endpoint_schema_["info"]["file_info"]["type"]

        if file_type == "csv":
            args = SchemaToolbox.generate_file_output_args(self.schema_json, endpoint_schema_)
            return self.write_to_csv(**args)
        elif file_type == "parquet":
            args = SchemaToolbox.generate_file_output_args(self.schema_json, endpoint_schema_)
            return self.write_to_parquet(**args) if args else self.write_to_parquet()
        elif file_type == "edi":
            args = SchemaToolbox.generate_file_output_args(self.schema_json, endpoint_schema_)
            return self.write_to_edi(**args) if args else self.write_to_edi()
        else:
            raise Exception("Invalid dataframe output type.")

    def scrub_pii(self):
        """
        """
        pass

    def perform_column_transformations(self, transformation_type_):
        """
        Takes the list of columns with transformations from the schema and applies them.
        UDFFactory.get_udf returns a UDF according to the name requested in the transformation.
        :param transformation_type_: TransformationType
        :return: None
        """
        if self.type == SDCDFTypes.PANDAS:
            for column in filter(lambda column: "transformations" in column, self.schema_json["fields"]):
                for transformation in filter(lambda t: t["transformation_type"] == transformation_type_.value,
                                             column["transformations"]):
                    logging.info("applying column transformation %s", transformation["type"])
                    axis = transformation.get('axis', 'columns')
                    transformation["opts"]["origin_column_name"] = column["name"]
                    self.df[column["name"].upper()] = self.df.apply(
                        lambda row: UDFFactory.get_udf(df_type_=self.type, udf_name_=transformation["type"]).apply_udf(
                            row, **transformation["opts"]),
                        axis=axis)
        else:
            raise Exception("Invalid dataframe type to perform_column_transformations.")
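
    # Hedged example of a schema field entry that would be picked up above (the
    # transformation name "trim_whitespace" is invented for illustration):
    #
    #   {"name": "email", "type": {"type": "string"},
    #    "transformations": [{"transformation_type": "pre", "type": "trim_whitespace", "opts": {}}]}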

    def perform_dataframe_transformations(self, transformation_type_):
        """
        Takes the list of dataframe-level transformations from the schema and applies them.
        UDFFactory.get_udf returns a UDF according to the name requested in the transformation.
        :param transformation_type_: TransformationType
        :return: None
        """
        if self.type == SDCDFTypes.PANDAS:
            if self.schema_json.get('dataframe_transformations'):
                for transformation in [
                        transformation for transformation in self.schema_json['dataframe_transformations']
                        if transformation['transformation_type'] == transformation_type_.value
                ]:
                    logging.info("applying dataframe transformation %s", transformation["type"])
                    UDFFactory.get_udf(
                        df_type_=self.type,
                        udf_name_=transformation["type"]).apply_udf(self.df, **transformation["opts"])
        else:
            raise Exception("Invalid dataframe type to perform_dataframe_transformations.")

    def drop_columns(self, column_list_):
        """
        Drops existing columns from the dataframe in place.
        :param column_list_: List of column names to drop.
        :return: None
        """

        if column_list_ and isinstance(column_list_, list):
            column_list = list(set(column_list_))
            # Iterate over a copy so that removing items does not skip elements.
            for column in list(column_list):
                if column not in self.df.columns.tolist():
                    logging.warning(f"Column '{column}' already does not exist in DataFrame. Skipping...")
                    column_list.remove(column)
            logging.info(f"Dropping columns {column_list}")
            self.df.drop(column_list, axis=1, inplace=True)
            logging.info("Columns dropped!")

        else:
            raise Exception("Must pass a list of columns to drop.")
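
    # Hedged usage example (the column names are illustrative):
    #
    #   sdcdf.drop_columns(["_TEMP_COLUMN", "DEBUG_NOTES"])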

    def cleanse_column_names(self, style_):
        """
        Renames columns in the dataframe via a preferred styling. Useful when needing
        to write to a sink that is sensitive to special characters or other
        naming conventions and we have no control over the source column
        naming.
        :param style_: Style for column renaming. Current options:
            - "snowflake_friendly": Allows letters, numbers and underscores
              only. Special characters become "_". "#" is changed to "NUM" to
              retain meaning. Example:
                "Default #: Addresses : Country (Full Name)"
              becomes...
                "DEFAULT_NUM_ADDRESSES__COUNTRY_FULL_NAME"
        :return: None
        """
        if style_.upper() in StyleEnums._member_names_:
            if self.type == SDCDFTypes.PANDAS:

                cols = self.df.columns.to_list()

                new_cols = [SnowflakeDatabase.clean_column_name(col, StyleEnums[style_.upper()]) for col in cols]
                self.df.columns = new_cols

            elif self.type == SDCDFTypes.SPARK:
                raise Exception("Spark dataframes unsupported currently.")

        else:
            raise Exception(f"'{style_}' not an option.")

    def fill_in_column(self, column_name_, column_value_, create_column_=False):
        """
        Updates a dataframe column with a new value (column-wide),
        except when the column is _ETL_FILENAME and the dataframe is a block of files.
        :param column_name_: Name of the dataframe column to be updated.
        :param column_value_: Value to update the dataframe column with.
        :param create_column_: Boolean. If True, and if the column to update does
            not exist in the dataframe, create the column and fill in the values.
        :return: None
        """

        column_name = column_name_.upper()
        col_exists = column_name in self.df.columns
        if not create_column_ and not col_exists:
            raise Exception(f"Cannot load values to column {column_name} as "
                            f"it does not exist in the df. Cannot create unless "
                            f"explicitly allowed.")
        else:
            logging.info(f"Setting column {column_name} with value '{column_value_}'")
            self.df[column_name] = column_value_
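
    # Hedged example (the column value is illustrative):
    #
    #   sdcdf.fill_in_column("_ETL_FILENAME", "employees_2019-09-10.csv", create_column_=True)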

    def log_dataframe(self, title_="Dataframe", log_memory_usage_=True):
        """
        Logs a Dataframe and its related memory usage stats to the console.

        :param title_: A title that describes the content of the Dataframe.
        :type title_: String
        :param log_memory_usage_: Whether or not to log memory usage.
        :type log_memory_usage_: Boolean
        :return: None
        """
        try:
            if self.type == SDCDFTypes.PANDAS:
                pd.set_option('display.max_rows', 500)
                pd.set_option('display.max_columns', 500)
                logging.info(title_)
                logging.info("----------------------------")
                logging.info(self.df)
                logging.info(self.df.shape)
                if log_memory_usage_:
                    logging.info(self.df.info(memory_usage="deep"))
            elif self.type == SDCDFTypes.SPARK:
                raise Exception("Spark not currently supported.")
            else:
                raise Exception("Invalid dataframe type to log.")
        except AttributeError:
            raise Exception(f"{title_} is empty.")

    def compare_data_schema(self):
        """
        """
        pass

    def get_schema_from_registry(self):
        """
        """
        pass