Coverage for libs/sdc_etl_libs/tests/dataframe_tests/sdc_dataframe_pandas_test.py : 100%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import datetime
2import sys
3import os
4sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../")
5from sdc_etl_libs.sdc_dataframe.Dataframe import *
6from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFPandasTypes
7import pandas as pd
8import numpy as np
9import json
10import pytest
12def test_schema_generation():
13 test_schema = """
14 {
15 "namespace": "Dataframe",
16 "type": "object",
17 "name": "I am a test",
18 "fields": [
19 {"name":"Id1","type":{"type":"string"}},
20 {"name":"Id2","type":{"type":"int"}},
21 {"name":"Id3","type":{"type":"boolean"}},
22 {"name":"Id4","type":{"type":"long"}},
23 {"name":"Id5","type":{"type":"float"}},
24 {"name":"Id6","type":{"type":"double"}},
25 {"name":"Id7","type":{"type":"string", "logical_type":"datetime"}}
26 ]
27 }
28 """
30 expected_result = [{"ID1": np.object_},{"ID2":pd.Int32Dtype()},{"ID3":np.bool_},{"ID4":pd.Int64Dtype()},{"ID5":np.float32},{"ID6":np.float64},{"ID7": np.datetime64}]
31 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
32 assert len(df.df_schema) == len(expected_result)
33 for i in range(len(expected_result)):
34 assert(df.df_schema[i] == expected_result[i])
36def test_load_data():
37 test_schema = """
38 {
39 "namespace": "Dataframe",
40 "type": "object",
41 "name": "I am a test",
42 "fields": [
43 {"name":"Id1","type":{"type":"string"}},
44 {"name":"Id2","type":{"type":"int"}},
45 {"name":"Id3","type":{"type":"boolean"}},
46 {"name":"Id4","type":{"type":"long"}},
47 {"name":"Id5","type":{"type":"float"}},
48 {"name":"Id6","type":{"type":"double"}},
49 {"name":"Id7","type":{"type":"string", "logical_type":"datetime", "add_column":true}}
50 ]
51 }
52 """
53 test_data = """
54 {
55 "Id1":"foo",
56 "Id2":10,
57 "Id3": false,
58 "Id4":223372036854777807,
59 "Id5":5.555,
60 "Id6": 2000.55,
61 "Id7": "2019-07-09 22:32:33.135693+00:00"
62 }
63 """
64 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
65 df.load_data([json.loads(test_data)])
66 assert df.df.iloc[0]["ID1"] == "foo"
67 assert df.df.iloc[0]["ID2"] == 10
68 assert df.df.iloc[0]["ID3"] == False
69 assert df.df.iloc[0]["ID4"] == 223372036854777807
70 assert math.isclose(df.df.iloc[0]["ID5"], 5.555, abs_tol=0.001)
71 assert math.isclose(df.df.iloc[0]["ID6"], 2000.55, abs_tol=0.01)
72 assert pd.Timestamp(df.df.iloc[0]["ID7"]).strftime("%Y-%m-%d %H:%M:%S") == '2019-07-09 22:32:33'
74 assert df.df["ID1"].dtype == SDCDFPandasTypes.string.value
75 assert df.df["ID2"].dtype == SDCDFPandasTypes.int.value
76 assert df.df["ID3"].dtype == np.object_
77 assert df.df["ID4"].dtype == SDCDFPandasTypes.long.value
78 assert df.df["ID5"].dtype == SDCDFPandasTypes.float.value
79 assert df.df["ID6"].dtype == SDCDFPandasTypes.double.value
80 assert df.df["ID7"].dtype == SDCDFPandasTypes.datetime.value or df.df["ID7"].dtype == np.dtype('>M8[ns]') or df.df["ID7"].dtype == np.dtype('<M8[ns]')
83def test_bad_data(mocker):
84 mocker.patch('pandas.Timestamp.utcnow', return_value=pd.Timestamp('2019-07-09 22:32:33.135693+00:00').tz_localize(None))
85 test_schema = """
86 {
87 "namespace": "Dataframe",
88 "type": "object",
89 "name": "I am a test",
90 "fields": [
91 {"name":"Id1","type":{"type":"string"}},
92 {"name":"Id2","type":{"type":"int"}},
93 {"name":"Id3","type":{"type":"boolean"}},
94 {"name":"Id4","type":{"type":"long"}},
95 {"name":"Id5","type":{"type":"float"}},
96 {"name":"Id6","type":{"type":"double"}},
97 {"name":"Id7","type":{"type":"string", "logical_type":"datetime"}}
98 ]
99 }
100 """
101 test_data = """
102 {
103 "Id1":"foo",
104 "Id2":"",
105 "Id3": false,
106 "Id4":223372036854777807,
107 "Id5":5.555,
108 "Id6": 2000.55,
109 "Id7": ""
110 }
111 """
112 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
113 df.load_data([json.loads(test_data)])
114 assert df.df.iloc[0]["ID1"] == "foo"
115 assert pd.isnull(df.df.iloc[0]["ID2"])
116 assert df.df.iloc[0]["ID3"] == False
117 assert df.df.iloc[0]["ID4"] == 223372036854777807
118 assert math.isclose(df.df.iloc[0]["ID5"], 5.555, abs_tol=0.001)
119 assert math.isclose(df.df.iloc[0]["ID6"], 2000.55, abs_tol=0.01)
120 assert pd.isnull(df.df.iloc[0]["ID7"])
123def test_missing_data(mocker):
124 mocker.patch('pandas.Timestamp.utcnow', return_value=pd.Timestamp('2019-07-09 22:32:33.135693+00:00').tz_localize(None))
125 test_schema = """
126 {
127 "namespace": "Dataframe",
128 "type": "object",
129 "name": "I am a test",
130 "fields": [
131 {"name":"Id1","type":{"type":"string"}},
132 {"name":"Id2","type":{"type":"int"}},
133 {"name":"Id3","type":{"type":"boolean"}},
134 {"name":"Id4","type":{"type":"long"}},
135 {"name":"Id5","type":{"type":"float"}},
136 {"name":"Id6","type":{"type":"double"}},
137 {"name":"Id7","type":{"type":"string", "logical_type":"datetime"}}
138 ]
139 }
140 """
141 test_data = """
142 {
143 "Id1":"foo",
144 "Id2":10,
145 "Id3": false,
146 "Id4":223372036854777807,
147 "Id5":5.555,
148 "Id6": 2000.55
149 }
150 """
151 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
152 with pytest.raises(Exception):
153 df.load_data([json.loads(test_data)])
155def test_unexpected_data(mocker):
156 mocker.patch('pandas.Timestamp.utcnow', return_value=pd.Timestamp('2019-07-09 22:32:33.135693+00:00').tz_localize(None))
157 test_schema = """
158 {
159 "namespace": "Dataframe",
160 "type": "object",
161 "name": "I am a test",
162 "fields": [
163 {"name":"Id1","type":{"type":"string"}},
164 {"name":"Id2","type":{"type":"int"}},
165 {"name":"Id3","type":{"type":"boolean"}},
166 {"name":"Id4","type":{"type":"long"}},
167 {"name":"Id5","type":{"type":"float"}},
168 {"name":"Id6","type":{"type":"double"}},
169 {"name":"Id7","type":{"type":"string", "logical_type":"datetime"}}
170 ]
171 }
172 """
173 test_data = """
174 {
175 "Id1":"foo",
176 "Id2":10,
177 "Id3": false,
178 "Id4":223372036854777807,
179 "Id5":5.555,
180 "Id6": 2000.55,
181 "Id7": "2019-07-09 22:32:33.135693+00:00",
182 "Id8" : "cool"
183 }
184 """
186 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
187 df.load_data([json.loads(test_data)])
188 assert("ID8" not in df.df.columns)
191def test_nullable_data(mocker):
192 mocker.patch('pandas.Timestamp.utcnow', return_value=pd.Timestamp('2019-07-09 22:32:33.135693+00:00').tz_localize(None))
193 test_schema = """
194 {
195 "namespace": "Dataframe",
196 "type": "object",
197 "name": "I am a test",
198 "fields": [
199 {"name":"Id1","type":{"type":"string"}},
200 {"name":"Id2","type":{"type":"int"}},
201 {"name":"Id3","type":{"type":"boolean"}},
202 {"name":"Id4","type":{"type":"long"}},
203 {"name":"Id5","type":{"type":"float"}},
204 {"name":"Id6","type":{"type":"double"}},
205 {"name":"Id7","type":{"type":"string", "logical_type":"datetime"}},
206 {"name":"Id9","type":{"type":"int"}, "is_nullable":true}
207 ]
208 }
209 """
210 test_data = """
211 {
212 "Id1":"foo",
213 "Id2":10,
214 "Id3": false,
215 "Id4":223372036854777807,
216 "Id5":5.555,
217 "Id6": 2000.55,
218 "Id7": "2019-07-09 22:32:33.135693+00:00",
219 "Id8" : "cool"
220 }
221 """
223 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
224 df.load_data([json.loads(test_data)])
225 assert("ID9" in df.df.columns)
228def test_find_json_columns(mocker):
230 test_schema = """
231 {
232 "namespace": "",
233 "type": "object",
234 "name": "",
235 "country_code": "USA",
236 "data_sink": {"type":"snowflake", "database": "", "table_name": "", "schema": ""},
237 "data_source": {"type": "api", "base_url": ""},
238 "fields": [
239 {"name":"CODE","type":{"type":"string"}},
240 {"name":"DESCRIPTION","type":{"type":"string"}},
241 {"name":"WBS","type":{"type":"string","logical_type":"json"}},
242 {"name":"BANKS","type":{"type":"string","logical_type":"json"}},
243 {"name":"LASTMODIFIEDAT","type":{"type":"string","logical_type":"datetime"}},
244 {"name":"LASTMODIFIEDBY","type":{"type":"string"}},
245 {"name":"BUDGETHOURS","type":{"type":"float"}},
246 {"name":"BUDGETCOST","type":{"type":"float"}},
247 {"name":"STARTDATE","type":{"type":"string", "logical_type":"datetime"}, "is_nullable": true},
248 {"name":"FINISHDATE","type":{"type":"string", "logical_type":"datetime"}, "is_nullable": true},
249 {"name":"PARENTKEY","type":{"type":"int"}, "is_nullable": true},
250 {"name":"STATUS","type":{"type":"string"}},
251 {"name":"PROJECT","type":{"type":"string","logical_type":"json"}},
252 {"name":"USERFIELDS","type":{"type":"string","logical_type":"json"}},
253 {"name":"_METADATA","type":{"type":"string","logical_type":"json"}},
254 {"name":"KEY","type":{"type":"int"},"sf_merge_key": true},
255 {"name":"_SF_INSERTEDDATETIME","type":{"type":"string","logical_type":"datetime", "add_column": true }}
256 ]
257 }"""
259 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
260 assert len(df.json_columns) == 5
263def test_renaming_fields():
264 test_schema = """
265 {
266 "namespace": "Test",
267 "name": "test",
268 "type": "object",
269 "country_code": "USA",
270 "estimated_row_size": "10b",
271 "estimated_row_count": 3000,
272 "contains_pii": false,
273 "fields": [
274 {
275 "name": "FIELD1",
276 "rename": "ID",
277 "type": {
278 "type": "int"
279 }
280 },
281 {
282 "name": "FIELD2",
283 "rename": "NAME",
284 "type": {
285 "type": "string"
286 }
287 }
288 ]
289 }
290 """
291 test_data = """
292 {
293 "FIELD1": 1,
294 "FIELD2": "foo"
295 }
296 """
297 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
298 df.load_data([json.loads(test_data)])
299 assert df.df.iloc[0]["ID"] == 1
300 assert df.df.iloc[0]["NAME"] == "foo"
301 assert "FIELD1" not in df.df.columns
302 assert "FIELD2" not in df.df.columns
305def test_default_value():
306 test_schema = """
307 {
308 "namespace": "Test",
309 "name": "test",
310 "type": "object",
311 "country_code": "USA",
312 "estimated_row_size": "10b",
313 "estimated_row_count": 3000,
314 "contains_pii": false,
315 "fields": [
316 {
317 "name": "ID",
318 "type": {
319 "type": "int"
320 },
321 "default_value": 1
322 },
323 {
324 "name": "NAME",
325 "type": {
326 "type": "string"
327 },
328 "default_value": "foo"
329 }
330 ]
331 }
332 """
333 test_data = """
334 {
335 "ID": null,
336 "NAME": ""
337 }
338 """
339 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
340 df.load_data([json.loads(test_data)])
341 assert df.df.iloc[0]["ID"] == 1
342 assert df.df.iloc[0]["NAME"] == "foo"
345def test_default_value_false():
346 test_schema = """
347 {
348 "namespace": "Test",
349 "name": "test",
350 "type": "object",
351 "country_code": "USA",
352 "estimated_row_size": "10b",
353 "estimated_row_count": 3000,
354 "contains_pii": false,
355 "fields": [
356 {
357 "name": "ID",
358 "type": {
359 "type": "int"
360 },
361 "default_value": 0
362 },
363 {
364 "name": "TEST",
365 "type": {
366 "type": "boolean"
367 },
368 "default_value": false
369 }
370 ]
371 }
372 """
373 test_data = """
374 {
375 "ID": null,
376 "TEST": null
377 }
378 """
379 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
380 df.load_data([json.loads(test_data)])
381 assert df.df.iloc[0]["ID"] == 0
382 assert df.df.iloc[0]["TEST"] == False
385def test_rename_with_default_value():
386 test_schema = """
387 {
388 "namespace": "Test",
389 "name": "test",
390 "type": "object",
391 "country_code": "USA",
392 "estimated_row_size": "10b",
393 "estimated_row_count": 3000,
394 "contains_pii": false,
395 "fields": [
396 {
397 "name": "FIELD1",
398 "rename": "ID",
399 "type": {
400 "type": "int"
401 },
402 "default_value": 1
403 },
404 {
405 "name": "FIELD2",
406 "rename": "NAME",
407 "type": {
408 "type": "string"
409 },
410 "default_value": "foo"
411 }
412 ]
413 }
414 """
415 test_data = """
416 {
417 "FIELD1": null,
418 "FIELD2": null
419 }
420 """
421 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
422 df.load_data([json.loads(test_data)])
423 assert df.df.iloc[0]["ID"] == 1
424 assert df.df.iloc[0]["NAME"] == "foo"
425 assert "FIELD1" not in df.df.columns
426 assert "FIELD2" not in df.df.columns
429def test_boolean_null_values():
430 test_schema = """
431 {
432 "namespace": "Test",
433 "name": "test",
434 "type": "object",
435 "country_code": "USA",
436 "estimated_row_size": "10b",
437 "estimated_row_count": 3000,
438 "contains_pii": false,
439 "fields": [
440 {
441 "name": "TEST",
442 "type": {
443 "type": "boolean"
444 }
445 }
446 ]
447 }
448 """
449 test_data = [
450 {
451 "TEST": "TRUE"
452 },
453 {
454 "TEST": "FALSE"
455 },
456 {
457 "TEST": "1"
458 },
459 {
460 "TEST": "0"
461 },
462 {
463 "TEST": "Yes"
464 },
465 {
466 "TEST": "N"
467 },
468 {
469 "TEST": True
470 },
471 {
472 "TEST": False
473 },
474 {
475 "TEST": 1
476 },
477 {
478 "TEST": 0
479 },
480 {
481 "TEST": None
482 }
483 ]
484 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
485 df.load_data(test_data)
486 assert df.df.iloc[0]["TEST"] == True
487 assert df.df.iloc[1]["TEST"] == False
488 assert df.df.iloc[2]["TEST"] == True
489 assert df.df.iloc[3]["TEST"] == False
490 assert df.df.iloc[4]["TEST"] == True
491 assert df.df.iloc[5]["TEST"] == False
492 assert df.df.iloc[6]["TEST"] == True
493 assert df.df.iloc[7]["TEST"] == False
494 assert df.df.iloc[8]["TEST"] == True
495 assert df.df.iloc[9]["TEST"] == False
496 assert pd.isnull(df.df.iloc[10]["TEST"])
498def test_add_column_with_default_value():
499 test_schema = """
500 {
501 "namespace": "Test",
502 "name": "test",
503 "type": "object",
504 "country_code": "USA",
505 "estimated_row_size": "10b",
506 "estimated_row_count": 3000,
507 "contains_pii": false,
508 "fields": [
509 {
510 "name": "ID",
511 "type": {
512 "type": "int"
513 }
514 },
515 {
516 "name": "TEST",
517 "type": {
518 "type": "string"
519 },
520 "add_column": true,
521 "default_value": "foo"
522 }
523 ]
524 }
525 """
526 test_data = [
527 {
528 "ID": 1
529 },
530 {
531 "ID": 2
532 },
533 {
534 "ID": 3
535 },
536 ]
537 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
538 df.load_data(test_data)
539 assert df.df["TEST"].all() == "foo"
542def test_add_column_current_timestamp():
543 test_schema = """
544 {
545 "namespace": "Test",
546 "name": "test",
547 "type": "object",
548 "country_code": "USA",
549 "estimated_row_size": "10b",
550 "estimated_row_count": 3000,
551 "contains_pii": false,
552 "fields": [
553 {
554 "name": "ID",
555 "type": {
556 "type": "int"
557 }
558 },
559 {
560 "name": "_SF_INSERTEDDATETIME",
561 "type": {
562 "type": "string",
563 "logical_type": "datetime"
564 },
565 "add_column": true
566 }
567 ]
568 }
569 """
570 test_data = [
571 {
572 "ID": 1
573 }
574 ]
575 df = Dataframe(SDCDFTypes.PANDAS, test_schema)
576 start_time = datetime.datetime.utcnow()
577 df.load_data(test_data)
578 end_time = datetime.datetime.utcnow()
579 assert df.df.iloc[0]["_SF_INSERTEDDATETIME"] >= start_time
580 assert df.df.iloc[0]["_SF_INSERTEDDATETIME"] <= end_time
581 assert df.df["_SF_INSERTEDDATETIME"].dtype == SDCDFPandasTypes.datetime.value \
582 or df.df["_SF_INSERTEDDATETIME"].dtype == np.dtype('>M8[ns]') \
583 or df.df["_SF_INSERTEDDATETIME"].dtype == np.dtype('<M8[ns]')