Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import functools 

2import logging 

3 

4from sdc_etl_libs.sdc_dataframe.udfs.pandas.PandasUDF import PandasUDF 

5 

6 

class CollapseColumnsPandasUDF(PandasUDF):
    """Pandas UDF that packs groups of prefix-sharing columns into one dict-valued column each."""

    @staticmethod
    def apply_udf(df_, **opts_):
        """Collapse, in place, every group of columns that shares a configured prefix.

        For each prefix in ``opts_["list_column_prefix"]`` the columns whose
        name starts with that prefix are removed and replaced by a single
        column named after the prefix. Each cell of the new column is a dict
        mapping the original column name to ``str(value)`` with every ``'$$'``
        removed. A prefix that matches no column still yields a new column
        (of empty dicts).

        :param df_: dataframe to transform; it is mutated in place
        :type df_: pandas.DataFrame
        :param opts_: must contain "list_column_prefix", the prefixes to
            collapse, e.g. ``list_column_prefix=['ANYTHING', 'CAN', 'COLLAPSE']``
        :type opts_: dict
        :return: None
        :raises KeyError: if "list_column_prefix" is missing from ``opts_``
        """

        def row_to_dict(row, list_columns):
            """Build ``{column: cleaned string value}`` for one row.

            :param row: dataframe row
            :type row: pandas.Series
            :param list_columns: names of the columns to pack
            :type list_columns: list
            :return: dict keyed by column name
            """
            return {column: str(row[column]).replace('$$', '') for column in list_columns}

        def collapse_columns(frame, column_prefix_):
            """Replace all columns starting with ``column_prefix_`` by one dict column.

            :param frame: dataframe to modify in place
            :type frame: pandas.DataFrame
            :param column_prefix_: prefix shared by the columns to collapse;
                also the name of the resulting column
            :type column_prefix_: str
            :return: None
            """
            # Extract columns sharing prefix
            matching = [column for column in frame.columns if column.startswith(column_prefix_)]

            if len(frame.index) == 0:
                # DataFrame.apply does not call the function once per row when
                # there are no rows and may hand back a DataFrame instead of a
                # Series, so the (empty) result is built explicitly.
                collapsed = []
            else:
                # The whole row is passed (not just the matching columns) so
                # the values are stringified exactly as they appear in a row.
                collapsed = frame.apply(lambda row: row_to_dict(row, matching), axis='columns')

            # Drop the source columns BEFORE storing the result: a source
            # column named exactly like the prefix is part of ``matching``,
            # and dropping afterwards would delete the collapsed column too.
            frame.drop(columns=matching, inplace=True)
            frame[column_prefix_] = collapsed

        # NOTE(review): the prefixes are iterated exactly as given; a plain
        # string would be walked character by character - confirm callers
        # pass a list.
        for column_prefix in opts_['list_column_prefix']:
            logging.info('collapsing %s', column_prefix)
            collapse_columns(df_, column_prefix)