Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2This module connects to sharepoint. 

3""" 

4import logging 

5 

6import requests 

7 

8from sdc_etl_libs.api_helpers.OAuthAPI import OAuthAPI 

9from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe 

10from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes 

11from sdc_etl_libs.sdc_file_helpers.SDCExcelFile import SDCExcelFile 

12 

# Make sure INFO-level messages are emitted. If the root logger already has
# handlers (e.g. the hosting runtime installed them), only adjust its level so
# the existing handlers are kept; otherwise install a default handler and format.
_root_logger = logging.getLogger()
if _root_logger.handlers:
    _root_logger.setLevel(logging.INFO)
else:
    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s: %(asctime)s: %(funcName)s: %(message)s",
    )

20 

21 

class Sharepoint(OAuthAPI):
    """
    Sharepoint class. Used for communicating with either Sharepoint or OneDrive
    over a REST API (default base URL: https://graph.microsoft.com/v1.0).

    Two kinds of sources are supported, selected by ``info.file_info.type`` in
    the endpoint schema:

    - ``excel``: an Excel workbook stored in the site's drive; ``file_name`` is
      the '/'-separated path of the workbook starting at the drive root.
    - ``sharepoint_list``: a Sharepoint list; ``file_name`` is the list name.
    """

    def __init__(self, schema_: dict, endpoint_schema_: dict, **kwargs):
        """
        Read the connection settings from the endpoint schema, authenticate and
        resolve the id of the Sharepoint site.

        :param schema_: Data schema handed to the Dataframe / SDCExcelFile helpers.
        :param endpoint_schema_: Endpoint schema. Must contain
            info.file_info.opts.file_name, info.file_info.type and
            info.access.scope. Optional keys under info.access: credentials,
            region, base_url, token_url, endpoint_name. info.opts.api_call_details.method
            is required.
        :param kwargs: Optional ``auth_`` dict with ``client_id``, ``client_secret``
            and ``scope`` overrides.
        :raises ValueError: If no API method is configured in the endpoint schema.
        :raises requests.HTTPError: If the site lookup returns an HTTP error status.
        """
        auth_ = kwargs.get("auth_")
        if auth_ is None:
            auth_ = {}

        self.RECORDS_PER_REQUEST = 5000  # current max rows sharepoint will return
        self.schema = schema_
        self.endpoint_schema = endpoint_schema_

        info = self.endpoint_schema["info"]
        access = info["access"]
        self.file_name = info["file_info"]["opts"]["file_name"]
        self.file_type = info["file_info"]["type"]
        scope = access["scope"]

        # Fall back to the default secret when the schema does not name one.
        credentials = access.get("credentials")
        if credentials is None:
            credentials = {
                'type': 'aws_secrets',
                "opts": {
                    "name": "SB/Sharepoint/API"
                }
            }

        super().__init__(
            refresh_token_="",  # we don't need it and it cannot be null otherwise super() will try to acquire it
            client_id_=auth_.get('client_id'),
            client_secret_=auth_.get('client_secret'),
            scope_=auth_.get('scope', scope),
            credential_type_=credentials.get('type'),
            credential_id_=credentials["opts"].get('name'),
            region_=access.get('region', "us-east-2"))

        self.base_url = access.get('base_url') or "https://graph.microsoft.com/v1.0"

        # NOTE(review): self.tenant_id is not set in this class; it is presumably
        # populated by OAuthAPI.__init__ from the credentials - confirm.
        token_url_template = access.get('token_url') or "https://login.microsoftonline.com/{}/oauth2/v2.0/token"
        self.access_token_url = token_url_template.format(self.tenant_id)

        # The site is addressed as "<host>:/sites/<name>", hence the ':' inserted
        # after the host part.
        endpoint_name = access.get('endpoint_name') or "smiledirectclub.sharepoint.com/sites/sdcanalytics"
        self.endpoint_name = endpoint_name.replace(".com/sites", ".com:/sites")

        self.get_access_token()

        self.method = self.endpoint_schema.get('info', {}).get('opts', {}).get('api_call_details', {}).get('method')
        if self.method is None:
            raise ValueError('No method provided.')

        # Getting sharepoint "site" id: the returned id is a comma-separated
        # value and the second component is the one used in all later URLs.
        url = f"{self.base_url}/sites/{self.endpoint_name}"
        self.site_id = self.__get_json(url).get('id').split(',')[1]

    def __get_json(self, url_: str) -> dict:
        """
        GET ``url_`` with the authenticated headers and return the decoded JSON body.

        :raises requests.HTTPError: On a 4xx/5xx response, so that an error
            payload is never mistaken for data further down.
        """
        resp = requests.get(url_, headers=self.headers)
        resp.raise_for_status()
        return resp.json()

    def __iter_collection(self, url_: str):
        """
        Yield every entry of a paged collection, following "@odata.nextLink"
        until the service stops returning one.
        """
        while url_ is not None:
            page = self.__get_json(url_)
            yield from page.get("value") or []
            url_ = page.get("@odata.nextLink")

    @staticmethod
    def __find_id_by_name(objects_, name_: str, where_: str) -> str:
        """
        Return the id of the first object in ``objects_`` whose name equals ``name_``.

        :raises IndexError: If no object matches. The exception type is the one
            the previous list-indexing implementation raised; only the message
            is new.
        """
        for obj in objects_:
            if obj.get('name') == name_:
                return obj['id']
        raise IndexError(f'"{name_}" was not found in {where_}.')

    def __get_excel_file(self):
        """
        This method retrieves Excel from Sharepoint

        The file path is walked one component at a time, starting at the drive
        root, to resolve the id of the file; the file is then downloaded and
        parsed with SDCExcelFile.

        :rtype: Dataframe
        :raises IndexError: If a path component does not exist.
        :raises ValueError: If the resolved item has no download URL.
        :raises requests.HTTPError: If any request returns an HTTP error status.
        """
        item_id = None
        for depth, name in enumerate(self.file_name.split('/')):
            if depth == 0:
                # getting list of files accessible on the site in the root folder
                url = f'{self.base_url}/sites/{self.site_id}/drive/root/children?$select=id,name,file,folder'
            else:
                # getting list of files accessible on the site in the subfolder
                url = f'{self.base_url}/sites/{self.site_id}/drive/items/{item_id}/children?$select=id,name,file,folder'

            # getting item_id for the subfolder or the requested file; all
            # result pages are searched, not just the first one
            item_id = self.__find_id_by_name(
                self.__iter_collection(url), name, f'Sharepoint path "{self.file_name}"')

        # retrieving file metadata, which carries the download URL
        url = f'{self.base_url}/sites/{self.site_id}/drive/items/{item_id}'
        download_url = self.__get_json(url).get('@microsoft.graph.downloadUrl')
        if not download_url:
            raise ValueError(f'"{self.file_name}" has no download URL (is it a file?).')

        # getting content
        data = requests.get(download_url, headers=self.headers)
        data.raise_for_status()
        excel_file = SDCExcelFile(
            schema_=self.schema,
            endpoint_schema_=self.endpoint_schema,
            file_name_=None,
            file_path_=None,
            file_obj_=data.content)

        return excel_file.get_file_as_dataframe()

    def __get_sharepoint_list(self):
        """
        This method scans a Sharepoint list, its columns and items, and
        retrieves the corresponding information

        :rtype: Dataframe
        """
        # get list id
        list_id = self.__get_list_id()
        # get valid columns
        valid_columns = self.__get_list_columns(list_id)
        # get list records
        list_records = self.__get_list_records(list_id, valid_columns)
        sdc_df = Dataframe(SDCDFTypes.PANDAS, self.schema)
        sdc_df.load_data(list_records)

        return sdc_df

    def __get_list_id(self):
        """
        Return the id of the site list whose name equals ``self.file_name``.

        :raises IndexError: If the site has no list with that name.
        """
        list_id_url = f'{self.base_url}/sites/{self.site_id}/lists?$select=id,name'
        return self.__find_id_by_name(
            self.__iter_collection(list_id_url), self.file_name, 'the Sharepoint site lists')

    def __get_list_columns(self, list_id_):
        """
        Return the names of the list columns that should be part of the dataset:
        columns that are neither hidden nor read-only, do not belong to the
        '_Hidden' column group and are not the 'Attachments' column.

        :param list_id_: Id of the Sharepoint list.
        :rtype: list
        """
        # get the columns and their metadata to check if we will need to include those in our dataset
        # NOTE(review): only 'columns' is read from this response; the
        # items(expand=fields) expansion looks unnecessary here - confirm before trimming.
        list_columns_url = f'{self.base_url}/sites/{self.site_id}/lists/{list_id_}?$select=id,name&expand=columns,items(expand=fields)'
        list_columns_meta = self.__get_json(list_columns_url).get('columns')
        return [
            col['name'] for col in list_columns_meta
            if col['hidden'] is False and col['readOnly'] is False
            and col['columnGroup'] != '_Hidden' and col['name'] != 'Attachments'
        ]

    def __get_list_records(self, list_id_, list_columns_):
        """
        This method retrieves Sharepoint List records through pagination

        :param list_id_: Id of the Sharepoint list.
        :param list_columns_: Names of the columns to keep in every record.
        :return: One dict per list item, restricted to ``list_columns_``.
        :rtype: list
        """
        # NOTE(review): the '&$&' in the query string looks like a typo; the URL
        # is kept unchanged because the service's handling of it is not visible here.
        list_items_url = f'{self.base_url}/sites/{self.site_id}/lists/{list_id_}/items?$top={self.RECORDS_PER_REQUEST}&$&expand=columns,items(expand=fields)'
        wanted = set(list_columns_)  # constant-time membership test per field
        list_records = []
        for rec_meta in self.__iter_collection(list_items_url):
            fields = rec_meta.get('fields')
            list_records.append({field: value for field, value in fields.items() if field in wanted})
        return list_records

    def get_data(self):
        """
        This method retrieves data from Sharepoint

        :rtype: Dataframe
        :raises ValueError: If the configured file type is not supported.
        """
        if self.file_type == 'excel':
            return self.__get_excel_file()
        if self.file_type == 'sharepoint_list':
            return self.__get_sharepoint_list()
        raise ValueError(
            f'Unsupported file type "{self.file_type}". Expected "excel" or "sharepoint_list".')

    def post_data(self):
        """
        This is a placeholder for a method that posts data to either Sharepoint or OneDrive

        :raises NotImplementedError:
        """
        raise NotImplementedError('Method POST is not implemented.')

    def get_response_data(self):
        """
        This is a wrapper method called from API.py and driven by schema

        :return: Data returned from the remote API for method "get"; None when
            the configured method is neither "get" nor "post".
        :rtype: Dataframe
        :raises NotImplementedError: If the configured method is "post".
        """
        data = None
        if self.method == "get":
            data = self.get_data()
        elif self.method == "post":
            data = self.post_data()

        return data

204 data = self.get_data() 

205 

206 if self.method == "post": 

207 data = self.post_data() 

208 

209 return data