Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import pandas as pd 

2 

3from sdc_etl_libs.sdc_dataframe.udfs.pandas.CollapseColumnsPandasUDF import \ 

4 CollapseColumnsPandasUDF 

5from sdc_etl_libs.sdc_dataframe.udfs.pandas.ConcatColumnsPandasUDF import \ 

6 ConcatColumnsPandasUDF 

7from sdc_etl_libs.sdc_dataframe.udfs.pandas.MD5HashRowPandasUDF import \ 

8 MD5HashRowPandasUDF 

9from sdc_etl_libs.sdc_dataframe.udfs.pandas.ConvertToDatetimePandasUDF import \ 

10 ConvertToDatetimePandasUDF 

11 

12 

def test_concat_columns_pandas_udf():
    """Check ConcatColumnsPandasUDF.apply_udf on single rows.

    Each case builds a Series indexed by ``ID1``/``ID2`` and expects the
    two values joined by the configured single-space separator. The second
    case uses an int for ``ID1`` and expects it rendered as text.
    """
    udf_kwargs = {"list_columns": ["ID1", "ID2"], "separator": " "}

    # (row values, expected concatenation)
    cases = (
        (["abc", "def"], "abc def"),
        ([3, "def"], "3 def"),
    )

    for values, expected in cases:
        row = pd.Series(values, index=['ID1', 'ID2'])
        assert ConcatColumnsPandasUDF.apply_udf(row_=row, **udf_kwargs) == expected

21 

22 

def test_collapse_columns_pandas_udf():
    """Check CollapseColumnsPandasUDF.apply_udf with the ``param`` prefix.

    The UDF is called for its effect on the frame passed in (its return
    value is ignored). Afterwards the frame must contain a ``param`` column
    and none of the original ``param_1``/``param_2``/``param_3`` columns.
    """
    source_columns = ['param_1', 'param_2', 'param_3']
    frame = pd.DataFrame(
        data={
            'param_1': [1, 2, 3, 4],
            'param_2': [{'4': 4}, {'5': 5}, {'6': 6}, {'7': 7}],
            'param_3': ['a', 'b', 'c', 'd'],
        },
        columns=source_columns,
    )
    udf_kwargs = {'list_column_prefix': ['param']}

    CollapseColumnsPandasUDF.apply_udf(frame, **udf_kwargs)

    assert 'param' in frame
    for removed in source_columns:
        assert removed not in frame

44 

45 

def test_md5_hash_row_pandas_udf():
    """Check the per-row digests produced by MD5HashRowPandasUDF.apply_udf.

    The UDF is applied to every row of a three-column frame (ints, dicts and
    strings) and the results are stored in a ``HASHMD5`` column, which is
    then compared with the expected hex digests.

    Previously the comparisons were bare ``==`` expressions with no
    ``assert``, so this test could never fail. They are now asserted.
    """
    data = {
        'param_1': [1, 2, 3, 4],
        'param_2': [{'4': 4}, {'5': 5}, {'6': 6}, {'7': 7}],
        'param_3': ['a', 'b', 'c', 'd'],
    }
    df = pd.DataFrame(data=data, columns=['param_1', 'param_2', 'param_3'])

    # No extra options are passed to the UDF (the original passed an empty dict).
    df['HASHMD5'] = df.apply(
        lambda row: MD5HashRowPandasUDF.apply_udf(row), axis='columns')

    # NOTE(review): these digests were never enforced before; if this
    # assertion fails, confirm the expected values against the UDF.
    expected_hashes = [
        '464f331d577d350fdbc25a3b14260b5c',
        'bd17965fe4e6ca56e418d411b457e68c',
        '7edfb7d75c50beba0149252f0fa35144',
        'edd0d84725107721add7e40a3ab95ddb',
    ]
    # Comparing whole lists lets pytest show every mismatching row at once.
    assert df['HASHMD5'].tolist() == expected_hashes

67 

68 

69def test_convert_unix_timestamp_pandas_udf(): 

70 data = { 

71 "ID": [1, 2], 

72 "CREATED": [1608582690, 1608582719], 

73 } 

74 df = pd.DataFrame(data=data, columns=['ID', 'CREATED']) 

75 opts = { 

76 "origin_column_name": "CREATED", 

77 "to_datetime_params": {"unit": "s"}, 

78 "origin_timezone": "UTC", 

79 "target_timezone": "America/Chicago" 

80 } 

81 df["CREATED"] = df.apply(lambda row: ConvertToDatetimePandasUDF.apply_udf(row_=row, **opts), axis='columns') 

82 print(data) 

83 df["CREATED"].iloc[0] == '2020-12-21 14:31:30' 

84 df["CREATED"].iloc[1] == '2020-12-21 14:31:59'