Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import datetime 

2import sys 

3import os 

4sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../") 

5from sdc_etl_libs.sdc_dataframe.Dataframe import * 

6from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFPandasTypes 

7import pandas as pd 

8import numpy as np 

9import json 

10import pytest 

11 

12def test_schema_generation(): 

13 test_schema = """ 

14 { 

15 "namespace": "Dataframe", 

16 "type": "object", 

17 "name": "I am a test", 

18 "fields": [ 

19 {"name":"Id1","type":{"type":"string"}}, 

20 {"name":"Id2","type":{"type":"int"}}, 

21 {"name":"Id3","type":{"type":"boolean"}}, 

22 {"name":"Id4","type":{"type":"long"}}, 

23 {"name":"Id5","type":{"type":"float"}}, 

24 {"name":"Id6","type":{"type":"double"}}, 

25 {"name":"Id7","type":{"type":"string", "logical_type":"datetime"}} 

26 ] 

27 } 

28 """ 

29 

30 expected_result = [{"ID1": np.object_},{"ID2":pd.Int32Dtype()},{"ID3":np.bool_},{"ID4":pd.Int64Dtype()},{"ID5":np.float32},{"ID6":np.float64},{"ID7": np.datetime64}] 

31 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

32 assert len(df.df_schema) == len(expected_result) 

33 for i in range(len(expected_result)): 

34 assert(df.df_schema[i] == expected_result[i]) 

35 

36def test_load_data(): 

37 test_schema = """ 

38 { 

39 "namespace": "Dataframe", 

40 "type": "object", 

41 "name": "I am a test", 

42 "fields": [ 

43 {"name":"Id1","type":{"type":"string"}}, 

44 {"name":"Id2","type":{"type":"int"}}, 

45 {"name":"Id3","type":{"type":"boolean"}}, 

46 {"name":"Id4","type":{"type":"long"}}, 

47 {"name":"Id5","type":{"type":"float"}}, 

48 {"name":"Id6","type":{"type":"double"}}, 

49 {"name":"Id7","type":{"type":"string", "logical_type":"datetime", "add_column":true}} 

50 ] 

51 } 

52 """ 

53 test_data = """ 

54 { 

55 "Id1":"foo", 

56 "Id2":10, 

57 "Id3": false, 

58 "Id4":223372036854777807, 

59 "Id5":5.555, 

60 "Id6": 2000.55, 

61 "Id7": "2019-07-09 22:32:33.135693+00:00" 

62 } 

63 """ 

64 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

65 df.load_data([json.loads(test_data)]) 

66 assert df.df.iloc[0]["ID1"] == "foo" 

67 assert df.df.iloc[0]["ID2"] == 10 

68 assert df.df.iloc[0]["ID3"] == False 

69 assert df.df.iloc[0]["ID4"] == 223372036854777807 

70 assert math.isclose(df.df.iloc[0]["ID5"], 5.555, abs_tol=0.001) 

71 assert math.isclose(df.df.iloc[0]["ID6"], 2000.55, abs_tol=0.01) 

72 assert pd.Timestamp(df.df.iloc[0]["ID7"]).strftime("%Y-%m-%d %H:%M:%S") == '2019-07-09 22:32:33' 

73 

74 assert df.df["ID1"].dtype == SDCDFPandasTypes.string.value 

75 assert df.df["ID2"].dtype == SDCDFPandasTypes.int.value 

76 assert df.df["ID3"].dtype == np.object_ 

77 assert df.df["ID4"].dtype == SDCDFPandasTypes.long.value 

78 assert df.df["ID5"].dtype == SDCDFPandasTypes.float.value 

79 assert df.df["ID6"].dtype == SDCDFPandasTypes.double.value 

80 assert df.df["ID7"].dtype == SDCDFPandasTypes.datetime.value or df.df["ID7"].dtype == np.dtype('>M8[ns]') or df.df["ID7"].dtype == np.dtype('<M8[ns]') 

81 

82 

83def test_bad_data(mocker): 

84 mocker.patch('pandas.Timestamp.utcnow', return_value=pd.Timestamp('2019-07-09 22:32:33.135693+00:00').tz_localize(None)) 

85 test_schema = """ 

86 { 

87 "namespace": "Dataframe", 

88 "type": "object", 

89 "name": "I am a test", 

90 "fields": [ 

91 {"name":"Id1","type":{"type":"string"}}, 

92 {"name":"Id2","type":{"type":"int"}}, 

93 {"name":"Id3","type":{"type":"boolean"}}, 

94 {"name":"Id4","type":{"type":"long"}}, 

95 {"name":"Id5","type":{"type":"float"}}, 

96 {"name":"Id6","type":{"type":"double"}}, 

97 {"name":"Id7","type":{"type":"string", "logical_type":"datetime"}} 

98 ] 

99 } 

100 """ 

101 test_data = """ 

102 { 

103 "Id1":"foo", 

104 "Id2":"", 

105 "Id3": false, 

106 "Id4":223372036854777807, 

107 "Id5":5.555, 

108 "Id6": 2000.55, 

109 "Id7": "" 

110 } 

111 """ 

112 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

113 df.load_data([json.loads(test_data)]) 

114 assert df.df.iloc[0]["ID1"] == "foo" 

115 assert pd.isnull(df.df.iloc[0]["ID2"]) 

116 assert df.df.iloc[0]["ID3"] == False 

117 assert df.df.iloc[0]["ID4"] == 223372036854777807 

118 assert math.isclose(df.df.iloc[0]["ID5"], 5.555, abs_tol=0.001) 

119 assert math.isclose(df.df.iloc[0]["ID6"], 2000.55, abs_tol=0.01) 

120 assert pd.isnull(df.df.iloc[0]["ID7"]) 

121 

122 

123def test_missing_data(mocker): 

124 mocker.patch('pandas.Timestamp.utcnow', return_value=pd.Timestamp('2019-07-09 22:32:33.135693+00:00').tz_localize(None)) 

125 test_schema = """ 

126 { 

127 "namespace": "Dataframe", 

128 "type": "object", 

129 "name": "I am a test", 

130 "fields": [ 

131 {"name":"Id1","type":{"type":"string"}}, 

132 {"name":"Id2","type":{"type":"int"}}, 

133 {"name":"Id3","type":{"type":"boolean"}}, 

134 {"name":"Id4","type":{"type":"long"}}, 

135 {"name":"Id5","type":{"type":"float"}}, 

136 {"name":"Id6","type":{"type":"double"}}, 

137 {"name":"Id7","type":{"type":"string", "logical_type":"datetime"}} 

138 ] 

139 } 

140 """ 

141 test_data = """ 

142 { 

143 "Id1":"foo", 

144 "Id2":10, 

145 "Id3": false, 

146 "Id4":223372036854777807, 

147 "Id5":5.555, 

148 "Id6": 2000.55 

149 } 

150 """ 

151 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

152 with pytest.raises(Exception): 

153 df.load_data([json.loads(test_data)]) 

154 

155def test_unexpected_data(mocker): 

156 mocker.patch('pandas.Timestamp.utcnow', return_value=pd.Timestamp('2019-07-09 22:32:33.135693+00:00').tz_localize(None)) 

157 test_schema = """ 

158 { 

159 "namespace": "Dataframe", 

160 "type": "object", 

161 "name": "I am a test", 

162 "fields": [ 

163 {"name":"Id1","type":{"type":"string"}}, 

164 {"name":"Id2","type":{"type":"int"}}, 

165 {"name":"Id3","type":{"type":"boolean"}}, 

166 {"name":"Id4","type":{"type":"long"}}, 

167 {"name":"Id5","type":{"type":"float"}}, 

168 {"name":"Id6","type":{"type":"double"}}, 

169 {"name":"Id7","type":{"type":"string", "logical_type":"datetime"}} 

170 ] 

171 } 

172 """ 

173 test_data = """ 

174 { 

175 "Id1":"foo", 

176 "Id2":10, 

177 "Id3": false, 

178 "Id4":223372036854777807, 

179 "Id5":5.555, 

180 "Id6": 2000.55, 

181 "Id7": "2019-07-09 22:32:33.135693+00:00", 

182 "Id8" : "cool" 

183 } 

184 """ 

185 

186 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

187 df.load_data([json.loads(test_data)]) 

188 assert("ID8" not in df.df.columns) 

189 

190 

191def test_nullable_data(mocker): 

192 mocker.patch('pandas.Timestamp.utcnow', return_value=pd.Timestamp('2019-07-09 22:32:33.135693+00:00').tz_localize(None)) 

193 test_schema = """ 

194 { 

195 "namespace": "Dataframe", 

196 "type": "object", 

197 "name": "I am a test", 

198 "fields": [ 

199 {"name":"Id1","type":{"type":"string"}}, 

200 {"name":"Id2","type":{"type":"int"}}, 

201 {"name":"Id3","type":{"type":"boolean"}}, 

202 {"name":"Id4","type":{"type":"long"}}, 

203 {"name":"Id5","type":{"type":"float"}}, 

204 {"name":"Id6","type":{"type":"double"}}, 

205 {"name":"Id7","type":{"type":"string", "logical_type":"datetime"}}, 

206 {"name":"Id9","type":{"type":"int"}, "is_nullable":true} 

207 ] 

208 } 

209 """ 

210 test_data = """ 

211 { 

212 "Id1":"foo", 

213 "Id2":10, 

214 "Id3": false, 

215 "Id4":223372036854777807, 

216 "Id5":5.555, 

217 "Id6": 2000.55, 

218 "Id7": "2019-07-09 22:32:33.135693+00:00", 

219 "Id8" : "cool" 

220 } 

221 """ 

222 

223 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

224 df.load_data([json.loads(test_data)]) 

225 assert("ID9" in df.df.columns) 

226 

227 

228def test_find_json_columns(mocker): 

229 

230 test_schema = """ 

231 { 

232 "namespace": "", 

233 "type": "object", 

234 "name": "", 

235 "country_code": "USA", 

236 "data_sink": {"type":"snowflake", "database": "", "table_name": "", "schema": ""}, 

237 "data_source": {"type": "api", "base_url": ""}, 

238 "fields": [ 

239 {"name":"CODE","type":{"type":"string"}}, 

240 {"name":"DESCRIPTION","type":{"type":"string"}}, 

241 {"name":"WBS","type":{"type":"string","logical_type":"json"}}, 

242 {"name":"BANKS","type":{"type":"string","logical_type":"json"}}, 

243 {"name":"LASTMODIFIEDAT","type":{"type":"string","logical_type":"datetime"}}, 

244 {"name":"LASTMODIFIEDBY","type":{"type":"string"}}, 

245 {"name":"BUDGETHOURS","type":{"type":"float"}}, 

246 {"name":"BUDGETCOST","type":{"type":"float"}}, 

247 {"name":"STARTDATE","type":{"type":"string", "logical_type":"datetime"}, "is_nullable": true}, 

248 {"name":"FINISHDATE","type":{"type":"string", "logical_type":"datetime"}, "is_nullable": true}, 

249 {"name":"PARENTKEY","type":{"type":"int"}, "is_nullable": true}, 

250 {"name":"STATUS","type":{"type":"string"}}, 

251 {"name":"PROJECT","type":{"type":"string","logical_type":"json"}}, 

252 {"name":"USERFIELDS","type":{"type":"string","logical_type":"json"}}, 

253 {"name":"_METADATA","type":{"type":"string","logical_type":"json"}}, 

254 {"name":"KEY","type":{"type":"int"},"sf_merge_key": true}, 

255 {"name":"_SF_INSERTEDDATETIME","type":{"type":"string","logical_type":"datetime", "add_column": true }} 

256 ] 

257 }""" 

258 

259 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

260 assert len(df.json_columns) == 5 

261 

262 

263def test_renaming_fields(): 

264 test_schema = """ 

265 { 

266 "namespace": "Test", 

267 "name": "test", 

268 "type": "object", 

269 "country_code": "USA", 

270 "estimated_row_size": "10b", 

271 "estimated_row_count": 3000, 

272 "contains_pii": false, 

273 "fields": [ 

274 { 

275 "name": "FIELD1", 

276 "rename": "ID", 

277 "type": { 

278 "type": "int" 

279 } 

280 }, 

281 { 

282 "name": "FIELD2", 

283 "rename": "NAME", 

284 "type": { 

285 "type": "string" 

286 } 

287 } 

288 ] 

289 } 

290 """ 

291 test_data = """ 

292 { 

293 "FIELD1": 1, 

294 "FIELD2": "foo" 

295 } 

296 """ 

297 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

298 df.load_data([json.loads(test_data)]) 

299 assert df.df.iloc[0]["ID"] == 1 

300 assert df.df.iloc[0]["NAME"] == "foo" 

301 assert "FIELD1" not in df.df.columns 

302 assert "FIELD2" not in df.df.columns 

303 

304 

305def test_default_value(): 

306 test_schema = """ 

307 { 

308 "namespace": "Test", 

309 "name": "test", 

310 "type": "object", 

311 "country_code": "USA", 

312 "estimated_row_size": "10b", 

313 "estimated_row_count": 3000, 

314 "contains_pii": false, 

315 "fields": [ 

316 { 

317 "name": "ID", 

318 "type": { 

319 "type": "int" 

320 }, 

321 "default_value": 1 

322 }, 

323 { 

324 "name": "NAME", 

325 "type": { 

326 "type": "string" 

327 }, 

328 "default_value": "foo" 

329 } 

330 ] 

331 } 

332 """ 

333 test_data = """ 

334 { 

335 "ID": null, 

336 "NAME": "" 

337 } 

338 """ 

339 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

340 df.load_data([json.loads(test_data)]) 

341 assert df.df.iloc[0]["ID"] == 1 

342 assert df.df.iloc[0]["NAME"] == "foo" 

343 

344 

345def test_default_value_false(): 

346 test_schema = """ 

347 { 

348 "namespace": "Test", 

349 "name": "test", 

350 "type": "object", 

351 "country_code": "USA", 

352 "estimated_row_size": "10b", 

353 "estimated_row_count": 3000, 

354 "contains_pii": false, 

355 "fields": [ 

356 { 

357 "name": "ID", 

358 "type": { 

359 "type": "int" 

360 }, 

361 "default_value": 0 

362 }, 

363 { 

364 "name": "TEST", 

365 "type": { 

366 "type": "boolean" 

367 }, 

368 "default_value": false 

369 } 

370 ] 

371 } 

372 """ 

373 test_data = """ 

374 { 

375 "ID": null, 

376 "TEST": null 

377 } 

378 """ 

379 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

380 df.load_data([json.loads(test_data)]) 

381 assert df.df.iloc[0]["ID"] == 0 

382 assert df.df.iloc[0]["TEST"] == False 

383 

384 

385def test_rename_with_default_value(): 

386 test_schema = """ 

387 { 

388 "namespace": "Test", 

389 "name": "test", 

390 "type": "object", 

391 "country_code": "USA", 

392 "estimated_row_size": "10b", 

393 "estimated_row_count": 3000, 

394 "contains_pii": false, 

395 "fields": [ 

396 { 

397 "name": "FIELD1", 

398 "rename": "ID", 

399 "type": { 

400 "type": "int" 

401 }, 

402 "default_value": 1 

403 }, 

404 { 

405 "name": "FIELD2", 

406 "rename": "NAME", 

407 "type": { 

408 "type": "string" 

409 }, 

410 "default_value": "foo" 

411 } 

412 ] 

413 } 

414 """ 

415 test_data = """ 

416 { 

417 "FIELD1": null, 

418 "FIELD2": null 

419 } 

420 """ 

421 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

422 df.load_data([json.loads(test_data)]) 

423 assert df.df.iloc[0]["ID"] == 1 

424 assert df.df.iloc[0]["NAME"] == "foo" 

425 assert "FIELD1" not in df.df.columns 

426 assert "FIELD2" not in df.df.columns 

427 

428 

429def test_boolean_null_values(): 

430 test_schema = """ 

431 { 

432 "namespace": "Test", 

433 "name": "test", 

434 "type": "object", 

435 "country_code": "USA", 

436 "estimated_row_size": "10b", 

437 "estimated_row_count": 3000, 

438 "contains_pii": false, 

439 "fields": [ 

440 { 

441 "name": "TEST", 

442 "type": { 

443 "type": "boolean" 

444 } 

445 } 

446 ] 

447 } 

448 """ 

449 test_data = [ 

450 { 

451 "TEST": "TRUE" 

452 }, 

453 { 

454 "TEST": "FALSE" 

455 }, 

456 { 

457 "TEST": "1" 

458 }, 

459 { 

460 "TEST": "0" 

461 }, 

462 { 

463 "TEST": "Yes" 

464 }, 

465 { 

466 "TEST": "N" 

467 }, 

468 { 

469 "TEST": True 

470 }, 

471 { 

472 "TEST": False 

473 }, 

474 { 

475 "TEST": 1 

476 }, 

477 { 

478 "TEST": 0 

479 }, 

480 { 

481 "TEST": None 

482 } 

483 ] 

484 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

485 df.load_data(test_data) 

486 assert df.df.iloc[0]["TEST"] == True 

487 assert df.df.iloc[1]["TEST"] == False 

488 assert df.df.iloc[2]["TEST"] == True 

489 assert df.df.iloc[3]["TEST"] == False 

490 assert df.df.iloc[4]["TEST"] == True 

491 assert df.df.iloc[5]["TEST"] == False 

492 assert df.df.iloc[6]["TEST"] == True 

493 assert df.df.iloc[7]["TEST"] == False 

494 assert df.df.iloc[8]["TEST"] == True 

495 assert df.df.iloc[9]["TEST"] == False 

496 assert pd.isnull(df.df.iloc[10]["TEST"]) 

497 

498def test_add_column_with_default_value(): 

499 test_schema = """ 

500 { 

501 "namespace": "Test", 

502 "name": "test", 

503 "type": "object", 

504 "country_code": "USA", 

505 "estimated_row_size": "10b", 

506 "estimated_row_count": 3000, 

507 "contains_pii": false, 

508 "fields": [ 

509 { 

510 "name": "ID", 

511 "type": { 

512 "type": "int" 

513 } 

514 }, 

515 { 

516 "name": "TEST", 

517 "type": { 

518 "type": "string" 

519 }, 

520 "add_column": true, 

521 "default_value": "foo" 

522 } 

523 ] 

524 } 

525 """ 

526 test_data = [ 

527 { 

528 "ID": 1 

529 }, 

530 { 

531 "ID": 2 

532 }, 

533 { 

534 "ID": 3 

535 }, 

536 ] 

537 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

538 df.load_data(test_data) 

539 assert df.df["TEST"].all() == "foo" 

540 

541 

542def test_add_column_current_timestamp(): 

543 test_schema = """ 

544 { 

545 "namespace": "Test", 

546 "name": "test", 

547 "type": "object", 

548 "country_code": "USA", 

549 "estimated_row_size": "10b", 

550 "estimated_row_count": 3000, 

551 "contains_pii": false, 

552 "fields": [ 

553 { 

554 "name": "ID", 

555 "type": { 

556 "type": "int" 

557 } 

558 }, 

559 { 

560 "name": "_SF_INSERTEDDATETIME", 

561 "type": { 

562 "type": "string", 

563 "logical_type": "datetime" 

564 }, 

565 "add_column": true 

566 } 

567 ] 

568 } 

569 """ 

570 test_data = [ 

571 { 

572 "ID": 1 

573 } 

574 ] 

575 df = Dataframe(SDCDFTypes.PANDAS, test_schema) 

576 start_time = datetime.datetime.utcnow() 

577 df.load_data(test_data) 

578 end_time = datetime.datetime.utcnow() 

579 assert df.df.iloc[0]["_SF_INSERTEDDATETIME"] >= start_time 

580 assert df.df.iloc[0]["_SF_INSERTEDDATETIME"] <= end_time 

581 assert df.df["_SF_INSERTEDDATETIME"].dtype == SDCDFPandasTypes.datetime.value \ 

582 or df.df["_SF_INSERTEDDATETIME"].dtype == np.dtype('>M8[ns]') \ 

583 or df.df["_SF_INSERTEDDATETIME"].dtype == np.dtype('<M8[ns]')