Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import logging 

2import os 

3import re 

4from collections import namedtuple 

5from datetime import datetime 

6from sdc_etl_libs.sdc_data_exchange.SDCDataExchangeEnums import FileResultTypes 

7 

8 

class Fields:
    """
    Constants naming the SERRA fields used by this module.

    Besides the plain string identifiers, the class exposes:
      * Asset: namedtuple (identifier, position) describing one asset
      * assets_required: list of Asset entries, one per asset identifier
      * destination_columns: dict mapping a source field to its destination column name
    """
    # --- metadata identifiers ---
    CASE_NUMBER = "CASE_NUMBER"
    CASE_ID = "CASE_ID"
    EMAIL = "PATIENT_EMAIL"
    LAB_ID = "LAB_CASE_ID"
    LAB = "LAB"
    SERRA_PATH = "SERRA_PATH"

    # --- asset identifiers (source side) ---
    CLOSED_STRAIGHT = "PHOTOS_CLOSED_STRAIGHT"
    OPEN_LOWER = "PHOTOS_OPEN_LOWER"
    OPEN_UPPER = "PHOTOS_OPEN_UPPER"
    OBJ = "OBJ"

    # --- asset column names (destination side) ---
    # NOTE(review): OPEN_LOWER points to "Upper_arch_img_path" and OPEN_UPPER to
    # "Lower_arch_img_path", while assets_required below labels OPEN_UPPER as
    # "upper_arch". This may be intentional; confirm before changing anything.
    DESTINATION_CLOSED_STRAIGHT = "Closed_img_path"
    DESTINATION_OPEN_LOWER = "Upper_arch_img_path"
    DESTINATION_OPEN_UPPER = "Lower_arch_img_path"
    DESTINATION_OBJ = "Obj_path"

    # Pair of (field identifier, position label) for a single asset
    Asset = namedtuple("Asset", "identifier position")

    assets_required = [
        Asset(identifier=OBJ, position="obj"),
        Asset(identifier=CLOSED_STRAIGHT, position="closed"),
        Asset(identifier=OPEN_UPPER, position="upper_arch"),
        Asset(identifier=OPEN_LOWER, position="lower_arch"),
    ]

    # Source field -> destination column; SERRA_PATH maps onto itself
    destination_columns = {
        CLOSED_STRAIGHT: DESTINATION_CLOSED_STRAIGHT,
        OPEN_UPPER: DESTINATION_OPEN_UPPER,
        OPEN_LOWER: DESTINATION_OPEN_LOWER,
        OBJ: DESTINATION_OBJ,
        SERRA_PATH: SERRA_PATH,
    }

43 

44 

class Metrics:
    """
    Constants naming the metrics reported by this module, plus the numeric
    codes used for expired and moved cases.
    """
    # --- cases selected to be moved ---
    SERRA_TOTAL = "SERRA_CASES_TO_MOVE"
    RETRY_CASES = "RETRY_CASES_TO_MOVE"
    TOTAL_DISTINCT_CASES_TO_MOVE = "TOTAL_DISTINCT_CASES_TO_MOVE"

    # --- outcome of the move ---
    SERRA_CASES_MOVED = "SERRA_CASES_MOVED"
    RETRY_CASES_MOVED = "RETRY_CASES_MOVED"
    TOTAL_CASES_MOVED = "TOTAL_CASES_MOVED"
    TOTAL_CASES_NOT_MOVED = "TOTAL_CASES_NOT_MOVED"
    RETRY_CASES_EXPIRED = "RETRY_CASES_EXPIRED"

    # --- asset availability ---
    CASES_WITH_OBJ_AND_IMG = "CASES_WITH_OBJ_AND_IMG"
    CASES_WITH_OBJ = "CASES_WITH_OBJ"

    # --- numeric case codes ---
    EXPIRED_CASE_CODE = 404
    MOVED_CASE_CODE = 200

62 

63 

def add_columns_to_df(df_, new_column_names_):
    """
    Create one empty-string column per requested name on the given dataframe.

    The dataframe is modified in place and the same object is returned.
    A column that already exists is overwritten with empty strings.
    :param df_: Pandas DataFrame
    :param new_column_names_: List. List of strings containing the new column names
    :return: Pandas DataFrame.
    """
    blank_value = ""
    for column_name in new_column_names_:
        df_[column_name] = blank_value
    return df_

74 

75 

def filter_data(df_, serra_info_):
    """
    Filter a dataframe with asset_service information so only the rows whose email
    is not null and is one of the serra emails are kept.

    Both conditions are evaluated on the full dataframe and combined into a single
    boolean mask. Applying them one after the other (df[mask_a][mask_b]) makes pandas
    reindex the second mask against the already filtered frame, which emits a
    "Boolean Series key will be reindexed" warning and can fail when the index
    contains duplicate labels.
    :param df_: Pandas DataFrame. Must contain the Fields.EMAIL column
    :param serra_info_: dict. Contains serra_emails as the keys of the dictionary
    :return: Pandas DataFrame. The rows of df_ that satisfy both conditions
    """
    emails = df_[Fields.EMAIL]
    rows_to_keep = emails.notnull() & emails.isin(list(serra_info_))
    return df_[rows_to_keep]

84 

85 

# Trailing UTC offset of an asset_service timestamp: "+00", "+00:00", "-0500", ...
_UTC_OFFSET_PATTERN = re.compile(r'([+-]\d{2})(?::?(\d{2}))?$')


def _parse_asset_service_time(raw_date_):
    """
    Parse an asset_service timestamp into a naive datetime expressed in UTC.
    :param raw_date_: string. 'YYYY-MM-DD HH:MM:SS[.ffffff]' optionally followed by a UTC offset
    :return: datetime. Naive datetime; the offset, when present, has already been subtracted
    """
    date_format = '%Y-%m-%d %H:%M:%S.%f' if "." in raw_date_ else '%Y-%m-%d %H:%M:%S'
    offset = _UTC_OFFSET_PATTERN.search(raw_date_)
    if offset is None:
        # No recognizable offset: keep the historical behavior of ignoring anything after "+"
        return datetime.strptime(raw_date_.split("+")[0], date_format)
    # Rebuild the offset as "+HHMM", the form "%z" accepts on every Python 3 version
    normalized = raw_date_[:offset.start()] + offset.group(1) + (offset.group(2) or "00")
    aware_time = datetime.strptime(normalized, date_format + '%z')
    # Shift to UTC and drop tzinfo so it can be subtracted from the naive serra time
    return aware_time.replace(tzinfo=None) - aware_time.utcoffset()


def is_a_valid_asset(element_, position_, serra_creation_date_, asset_create_date_, asset_path_):
    """
    Determines if this is a valid asset after applying nullity and time range criteria.

    The asset is rejected (and an error is logged) when its path is not a string or when
    its creation date is missing, i.e. it is not a string or it is the literal 'NaT'.
    Otherwise the difference in whole days between the asset_service creation time and
    the serra creation time must be between a lower bound and 14. The lower bound is 0 for
    the "obj" position and -3 for any other position. The difference is taken from
    timedelta.days, so a negative partial day counts as a full day (e.g. -3 hours is -1).
    :param element_: dict. Must contain the asset_path_ and asset_create_date_ keys
    :param position_: string. Position of the asset (obj or image)
    :param serra_creation_date_: string. Creation date of the asset reported by serra,
        formatted as 'YYYY-MM-DDTHH:MM:SSZ'
    :param asset_create_date_: string. Key of element_ holding the creation date reported
        by the asset_service
    :param asset_path_: string. Key of element_ holding the S3 path of the asset
    :return: Boolean.
    :raises ValueError: if one of the dates does not match its expected format
    """
    if not isinstance(element_[asset_path_], str):
        logging.error(
            f"{FileResultTypes.error.value}: CASE_NUMBER: {element_[Fields.CASE_NUMBER]} no files available for asset {asset_path_}"
        )
        return False
    raw_create_date = element_[asset_create_date_]
    # A missing date can arrive as the 'NaT' string or as a non-string null (None, NaN, NaT)
    if not isinstance(raw_create_date, str) or raw_create_date == 'NaT':
        logging.error(
            f"{FileResultTypes.error.value}: CASE_NUMBER: {element_[Fields.CASE_NUMBER]} no creation time available for {asset_path_}"
        )
        return False
    serra_time = datetime.strptime(serra_creation_date_, '%Y-%m-%dT%H:%M:%SZ')
    asset_service_time = _parse_asset_service_time(raw_create_date)
    time_difference = (asset_service_time - serra_time).days
    min_days = -3 if position_ != "obj" else 0
    return min_days <= time_difference <= 14

113 

114 

def get_image_file_extension(asset_path_):
    """
    Extract the file extension of an image asset stored in S3.
    There are two kind of image file names:
    1. Images with a short suffix of at most 5 characters including the dot (.jpg, .png, .jpeg),
       or with no suffix at all. The method returns the suffix as is (an empty string when
       the name contains no dot)
    2. Images without a real file extension. Those cases are usually images related with MCC or
       Refinements, for example "2020-01-11 17:03:37.041147+00:00_OPEN_UP_OUT_REFINE_C000162b0e6e04".
       In those cases the method returns the description of the photo, from its leading underscore
       up to (not including) the last underscore ('_OPEN_UP_OUT_REFINE' in the example), and
       appends a default extension (.jpeg)
    When a long suffix holds no such description (fewer than two underscores) only the default
    extension is returned instead of failing.
    :param asset_path_: string. S3 Path of the asset
    :return: string.
    """
    default_extension = ".jpeg"
    file_extension = os.path.splitext(asset_path_)[1]
    if len(file_extension) <= 5:
        return file_extension
    # Greedy match: everything from the first underscore up to the last one
    description = re.search(r'(_.*)_', file_extension)
    if description is None:
        return default_extension
    return description.group(1) + default_extension