Coverage for libs/sdc_etl_libs/api_helpers/apis/Sharepoint/sharepoint.py : 79%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
"""
This module connects to Sharepoint (or OneDrive) and retrieves Excel files
and Sharepoint lists as dataframes.
"""
4import logging
6import requests
8from sdc_etl_libs.api_helpers.OAuthAPI import OAuthAPI
9from sdc_etl_libs.sdc_dataframe.Dataframe import Dataframe
10from sdc_etl_libs.sdc_dataframe.SDCDataframeEnums import SDCDFTypes
11from sdc_etl_libs.sdc_file_helpers.SDCExcelFile import SDCExcelFile
# Ensure INFO-level messages are emitted. When the root logger has no handlers
# yet, install the default handler together with the module's message format;
# otherwise leave the existing handlers alone and only raise the root level.
if not logging.getLogger().handlers:
    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s: %(asctime)s: %(funcName)s: %(message)s",
    )
else:
    logging.getLogger().setLevel(logging.INFO)
class Sharepoint(OAuthAPI):
    """
    Sharepoint class. Used for communicating with either Sharepoint or OneDrive.

    The instance is configured by two dictionaries: ``schema_`` (passed on to
    the dataframe / Excel helpers) and ``endpoint_schema_`` (access details,
    file details and API call details). Every request is issued with
    ``requests.get`` using ``self.headers``.

    NOTE(review): ``self.headers`` and ``self.tenant_id`` are not assigned in
    this class; presumably they are provided by ``OAuthAPI`` - confirm.
    """

    def __init__(self, schema_: dict, endpoint_schema_: dict, **kwargs):
        """
        Reads the endpoint configuration, acquires an access token and resolves
        the id of the Sharepoint site.

        :param schema_: data schema, handed over to Dataframe / SDCExcelFile
        :param endpoint_schema_: endpoint schema. Required keys:
            info.file_info.opts.file_name, info.file_info.type,
            info.access.scope and info.opts.api_call_details.method.
            Optional keys under info.access: credentials, region, base_url,
            token_url and endpoint_name (defaults are applied when missing).
        :param kwargs: may contain "auth_", a dict with optional "client_id",
            "client_secret" and "scope" entries
        :raises ValueError: when no API method is configured
        :raises requests.HTTPError: when the site lookup returns an HTTP error
        """
        auth_ = kwargs.get("auth_")
        if auth_ is None:
            auth_ = {}

        self.RECORDS_PER_REQUEST = 5000  # current max rows sharepoint will return
        self.schema = schema_
        self.endpoint_schema = endpoint_schema_
        self.file_name = self.endpoint_schema["info"]["file_info"]["opts"]["file_name"]
        self.file_type = self.endpoint_schema["info"]["file_info"]["type"]

        # "info" -> "access" is mandatory (the scope lookup below fails without
        # it), so the optional settings can be read from this dict directly.
        access = self.endpoint_schema["info"]["access"]
        scope = access["scope"]

        credentials = access.get("credentials")
        if credentials is None:
            credentials = {
                'type': 'aws_secrets',
                "opts": {
                    "name": "SB/Sharepoint/API"
                }
            }

        super().__init__(
            refresh_token_="",  # we don't need it and it cannot be null otherwise super() will try to acquire it
            client_id_=auth_.get('client_id'),
            client_secret_=auth_.get('client_secret'),
            scope_=auth_.get('scope', scope),
            credential_type_=credentials.get('type'),
            credential_id_=credentials["opts"].get('name'),
            region_=access.get('region', "us-east-2"))

        # Missing or empty optional settings fall back to the defaults.
        self.base_url = access.get('base_url') or "https://graph.microsoft.com/v1.0"

        token_url_template = access.get('token_url') or "https://login.microsoftonline.com/{}/oauth2/v2.0/token"
        self.access_token_url = token_url_template.format(self.tenant_id)

        endpoint_name = access.get('endpoint_name') or "smiledirectclub.sharepoint.com/sites/sdcanalytics"
        # The site is addressed as "<host>:/sites/<path>" in the lookup URL below.
        self.endpoint_name = endpoint_name.replace(".com/sites", ".com:/sites")

        self.get_access_token()

        self.method = self.endpoint_schema.get('info', {}).get('opts', {}).get('api_call_details', {}).get('method')
        if self.method is None:
            raise ValueError('No method provided.')

        # Getting sharepoint "site" id: the returned id is split on commas and
        # the second component is kept.
        url = f"{self.base_url}/sites/{self.endpoint_name}"
        self.site_id = self.__get_json(url).get('id').split(',')[1]

    def __get_json(self, url_):
        """
        Issues a GET request and returns the decoded JSON body.

        :param url_: URL to request
        :return: decoded JSON payload
        :raises requests.HTTPError: when the response carries an HTTP error status
        """
        resp = requests.get(url_, headers=self.headers)
        # Fail here with the HTTP status instead of later on a missing key.
        resp.raise_for_status()
        return resp.json()

    def __iter_collection(self, url_):
        """
        Yields every element of a paged collection, following "@odata.nextLink"
        until the response no longer contains one.

        :param url_: URL of the first page
        :return: generator over the elements found under "value" on each page
        """
        while url_:
            payload = self.__get_json(url_)
            yield from payload.get('value') or []
            url_ = payload.get('@odata.nextLink')

    @staticmethod
    def __find_id(objects_, name_, location_):
        """
        Returns the id of the first object whose name equals name_.

        :param objects_: iterable of dicts holding "name" and "id"
        :param name_: name to look for
        :param location_: description of the searched place, used in the error
        :raises IndexError: when no object carries that name (same exception
            type the previous list-indexing lookup produced)
        """
        for obj in objects_:
            if obj['name'] == name_:
                return obj['id']
        raise IndexError(f"'{name_}' was not found in {location_}.")

    def __get_excel_file(self):
        """
        This method retrieves Excel from Sharepoint.

        self.file_name is treated as a "/"-separated path below the root of the
        site drive; every segment is resolved to an item id, one level at a time.

        :rtype: Dataframe
        :raises IndexError: when a path segment does not exist
        :raises ValueError: when the resolved item offers no download URL
        :raises requests.HTTPError: when a request returns an HTTP error status
        """
        item_id = None
        # Leading/trailing slashes would produce empty segments that can never match.
        for idx, item in enumerate(self.file_name.strip('/').split('/')):
            if idx == 0:
                # getting list of files accessible on the site in the root folder
                url = f'{self.base_url}/sites/{self.site_id}/drive/root/children?$select=id,name,file,folder'
            else:
                # getting list of files accessible on the site in the subfolder
                url = f'{self.base_url}/sites/{self.site_id}/drive/items/{item_id}/children?$select=id,name,file,folder'

            # getting item_id for the subfolder or the requested file
            item_id = self.__find_id(self.__iter_collection(url), item, f"'{self.file_name}' on the site drive")

        # retrieving file metadata, which carries the download URL
        file_meta = self.__get_json(f'{self.base_url}/sites/{self.site_id}/drive/items/{item_id}')
        download_url = file_meta.get('@microsoft.graph.downloadUrl')
        if not download_url:
            raise ValueError(f"No download URL available for '{self.file_name}'.")

        # getting content
        # NOTE(review): the authorization headers are sent to the download URL
        # as well; check whether that URL actually needs them.
        data = requests.get(download_url, headers=self.headers)
        data.raise_for_status()

        excel_file = SDCExcelFile(
            schema_=self.schema,
            endpoint_schema_=self.endpoint_schema,
            file_name_=None,
            file_path_=None,
            file_obj_=data.content)
        return excel_file.get_file_as_dataframe()

    def __get_sharepoint_list(self):
        """
        This method scans a Sharepoint list, its columns and items, and
        retrieves the corresponding information.

        :rtype: Dataframe
        """
        # get list id
        list_id = self.__get_list_id()
        # get valid columns
        valid_columns = self.__get_list_columns(list_id)
        # get list records
        list_records = self.__get_list_records(list_id, valid_columns)

        sdc_df = Dataframe(SDCDFTypes.PANDAS, self.schema)
        sdc_df.load_data(list_records)
        return sdc_df

    def __get_list_id(self):
        """
        Returns the id of the site list whose name equals self.file_name.

        :raises IndexError: when the site has no list with that name
        """
        list_id_url = f'{self.base_url}/sites/{self.site_id}/lists?$select=id,name'
        return self.__find_id(self.__iter_collection(list_id_url), self.file_name, "the lists of the site")

    def __get_list_columns(self, list_id_):
        """
        Returns the names of the list columns that belong in the dataset.

        A column is kept when it is neither hidden nor read-only, is not part
        of the '_Hidden' column group and is not named 'Attachments'.

        :param list_id_: id of the Sharepoint list
        :return: list of column names
        """
        # get the columns and their metadata to check if we will need to include those in our dataset
        # NOTE(review): this URL also expands the list items although only
        # "columns" is read here - confirm whether that expansion is needed.
        list_columns_url = f'{self.base_url}/sites/{self.site_id}/lists/{list_id_}?$select=id,name&expand=columns,items(expand=fields)'
        list_columns_meta = self.__get_json(list_columns_url).get('columns')
        return [
            x['name'] for x in list_columns_meta if x['hidden'] is False and x['readOnly'] is False
            and x['columnGroup'] != '_Hidden' and x['name'] != 'Attachments'
        ]

    def __get_list_records(self, list_id_, list_columns_):
        """
        This method retrieves Sharepoint List records through pagination.

        :param list_id_: id of the Sharepoint list
        :param list_columns_: names of the columns to keep in every record
        :return: one dict per list item, restricted to the requested columns
        :rtype: list
        """
        list_items_url = f'{self.base_url}/sites/{self.site_id}/lists/{list_id_}/items?$top={self.RECORDS_PER_REQUEST}&$&expand=columns,items(expand=fields)'
        # A set makes the per-field membership test independent of the column count.
        wanted = set(list_columns_)

        list_records = []
        for rec_meta in self.__iter_collection(list_items_url):
            fields = rec_meta.get('fields')
            list_records.append({field: value for field, value in fields.items() if field in wanted})
        return list_records

    def get_data(self):
        """
        This method retrieves data from Sharepoint.

        :rtype: Dataframe
        :raises ValueError: when the configured file type is not supported
        """
        if self.file_type == 'excel':
            return self.__get_excel_file()
        if self.file_type == 'sharepoint_list':
            return self.__get_sharepoint_list()
        # Previously this fell through to an unbound local variable.
        raise ValueError(f"Unsupported file type '{self.file_type}'. Expected 'excel' or 'sharepoint_list'.")

    def post_data(self):
        """
        This is a placeholder for a method that posts data to either Sharepoint or OneDrive

        :raises NotImplementedError:
        """
        raise NotImplementedError('Method POST is not implemented.')

    def get_response_data(self):
        """
        This is a wrapper method called from API.py and driven by schema.

        :return: data returned from remote API for method "get"; None when the
            configured method is neither "get" nor "post"
        :rtype: Dataframe
        :raises NotImplementedError: when the configured method is "post"
        """
        if self.method == "get":
            return self.get_data()
        if self.method == "post":
            return self.post_data()
        return None