import base64
import datetime
import io
import logging
import operator
import os
import re

import boto3
from botocore.exceptions import ClientError
from dateutil import parser

class SDCS3:

    def __init__(self):

        self.access_key = None
        self.secret_key = None
        self.region = None
        self.client = None
        self.resource = None

    def connect(self, access_key_=None, secret_key_=None, region_='us-east-2'):
        """
        Creates S3 client and resource handles and assigns them to the following class attributes:
            client = self.client
            resource = self.resource
        :param access_key_: AWS Access Key. Default None.
        :param secret_key_: AWS Secret Key. Default None.
        :param region_: AWS Region. Default 'us-east-2'.
        :return: None.
        """

        self.region = region_
        self.client = boto3.client(
            's3', region_name=self.region, aws_access_key_id=access_key_, aws_secret_access_key=secret_key_)
        self.resource = boto3.resource(
            's3', region_name=self.region, aws_access_key_id=access_key_, aws_secret_access_key=secret_key_)
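    # Example usage (a minimal sketch; the key values and region below are hypothetical
    # placeholders, not credentials defined anywhere in this module):
    #
    #   s3 = SDCS3()
    #   s3.connect(access_key_="AKIA...", secret_key_="...", region_="us-east-2")
    #   # Passing None for both keys should let boto3 fall back to its default credential chain.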

    def check_date_and_convert(self, datetime_, format_, timedelta_type_=None, timedelta_amount_=None):
        """
        Takes a String or Datetime object, applies any time delta, then converts to the desired datetime format.
        :param datetime_: String or Datetime object.
        :param format_: String. Formatting to apply when converting result to string (ex. "%Y-%m-%d").
        :param timedelta_type_: String. Type of time delta to apply to datetime. Default = None.
            Options:
                - "days"
                - "hours"
                - "minutes"
        :param timedelta_amount_: Int. Amount to apply via time delta. Default = None.
        :return: String. Formatted datetime.
        """

        if isinstance(datetime_, str):
            datetime_ = parser.parse(datetime_).replace(tzinfo=None)
        if timedelta_type_:
            if timedelta_type_ == "days":
                datetime_ = datetime_ + datetime.timedelta(days=timedelta_amount_)
            elif timedelta_type_ == "hours":
                datetime_ = datetime_ + datetime.timedelta(hours=timedelta_amount_)
            elif timedelta_type_ == "minutes":
                datetime_ = datetime_ + datetime.timedelta(minutes=timedelta_amount_)
            else:
                raise NotImplementedError(f"Timedelta type argument \"{timedelta_type_}\" not currently supported.")
        return datetime_.strftime(format_)
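    # Example (a minimal sketch with made-up values):
    #
    #   s3.check_date_and_convert("2021-06-01T12:00:00Z", "%Y-%m-%d",
    #                             timedelta_type_="days", timedelta_amount_=-1)
    #   # -> "2021-05-31"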

    def get_objects_with_file_filters(self, bucket_name_, prefix_, prefix_filter_, file_filters_, date_filter_,
                                      **kwargs):
        """
        Creates a list of items in an S3 bucket using file filtering logic.
        :param bucket_name_: S3 bucket name.
        :param prefix_: Prefix of object in bucket.
        :param prefix_filter_: String. If given, only objects whose key contains this string are included.
        :param file_filters_: List of Dicts. Each dict describes a filter to apply to files in S3
            (see the "type" and "opts" keys used below). Default = None.
        :param date_filter_: String/Datetime object. Datetime that a file's datetime will be compared against.
            Default = None.
        :return: List of S3 file/object names as strings.
        """

        bucket = self.resource.Bucket(bucket_name_)
        available_objects = []

        for file_filter in file_filters_:
            if file_filter["type"] == "modified_date":
                # Set the allowed operators for date comparisons
                ops = {'>': operator.gt, '<': operator.lt, '>=': operator.ge, '<=': operator.le, '=': operator.eq}
                # Handle the comparison date given - add timedelta (if passed) and format
                comparison_date = self.check_date_and_convert(
                    datetime_=date_filter_,
                    format_=file_filter["opts"]["format"],
                    timedelta_type_=file_filter.get("opts", {}).get("timedelta", {}).get("type"),
                    timedelta_amount_=file_filter.get("opts", {}).get("timedelta", {}).get("amount"))
                if file_filter["opts"]["comparison"] not in ops:
                    raise NotImplementedError("Filter comparison type passed not currently supported.")

                logging.info("Attempting to locate all files that meet filter criteria")
                # For every file in the S3 bucket that matches the prefix given...
                for file in bucket.objects.filter(Prefix=prefix_):
                    # The file date will be the modified date of the object
                    file_date = self.check_date_and_convert(file.last_modified, file_filter["opts"]["format"])
                    # Apply the comparison operator given between the file date and the comparison date
                    if ops[file_filter["opts"]["comparison"]](file_date, comparison_date):
                        # If the comparison is True, include the file when no prefix_filter_ was given,
                        # or when the prefix_filter_ appears in the object key.
                        if not prefix_filter_ or prefix_filter_ in file.key:
                            if kwargs.get("give_full_path_", False):
                                available_objects.append(file.key)
                            else:
                                available_objects.append(file.key.replace(prefix_, "", 1))
            else:
                raise NotImplementedError("File filter type is not currently supported.")

        return available_objects
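    # Example file_filters_ payload (a minimal sketch; the bucket, prefix, and date are
    # hypothetical placeholders):
    #
    #   filters = [{
    #       "type": "modified_date",
    #       "opts": {
    #           "format": "%Y-%m-%d",
    #           "comparison": ">=",
    #           "timedelta": {"type": "days", "amount": -7},
    #       },
    #   }]
    #   s3.get_objects_with_file_filters(
    #       "my-bucket", "landing/", None, filters, "2021-06-08", give_full_path_=True)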

    def get_obj_list(self,
                     bucket_name_,
                     prefix_,
                     prefix_filter_,
                     obj_regex_=None,
                     give_full_path_=False,
                     file_filters_=None,
                     date_filter_=None):
        """
        Creates a list of items in an S3 bucket.
        :param bucket_name_: Name of S3 bucket to search.
        :param prefix_: Prefix used to find files.
        :param prefix_filter_: Filter applied to the object key.
        :param obj_regex_: If used, will return all objects that match this
            regex pattern. Default None.
        :param give_full_path_: If False, only file name will be returned. If
            True, full path & file name will be returned. Default = False.
        :param file_filters_: List of Dicts. Options for filtering files in S3. Default = None.
        :param date_filter_: String/Datetime object. Datetime that a file's datetime will be compared against.
            Default = None.
        :return: List of S3 file/object names as strings.
        """

        def iterate_on_s3_response(response_, files_, **kwargs):
            for item in response_["Contents"]:
                if kwargs["prefix_"] in item["Key"] and kwargs["prefix_filter_"] in item["Key"]:
                    if kwargs["give_full_path_"]:
                        files_.append(item["Key"])
                    else:
                        files_.append(os.path.basename(item["Key"]))

        available_objects = []
        object_results = []

        # If file_filters_ is given, construct a list of S3 files that meet those conditions.
        if file_filters_:
            available_objects = self.get_objects_with_file_filters(
                bucket_name_, prefix_, prefix_filter_, file_filters_, date_filter_, give_full_path_=give_full_path_)

        # If no file_filters_ given, grab all files that exist in the bucket/prefix path given
        else:
            response = self.client.list_objects_v2(Bucket=bucket_name_, Prefix=prefix_)

            if "Contents" in response:
                prefix_filter_ = prefix_filter_ if prefix_filter_ else ""
                iterate_on_s3_response(
                    response_=response,
                    files_=available_objects,
                    bucket_name_=bucket_name_,
                    prefix_=prefix_,
                    prefix_filter_=prefix_filter_,
                    give_full_path_=give_full_path_)
                while response["IsTruncated"]:
                    logging.info(response["NextContinuationToken"])
                    response = self.client.list_objects_v2(
                        Bucket=bucket_name_, Prefix=prefix_, ContinuationToken=response["NextContinuationToken"])
                    iterate_on_s3_response(
                        response_=response,
                        files_=available_objects,
                        bucket_name_=bucket_name_,
                        prefix_=prefix_,
                        prefix_filter_=prefix_filter_,
                        give_full_path_=give_full_path_)

        # If obj_regex_ is given, return only those files that meet the regex criteria
        if obj_regex_:
            object_results = [x for x in available_objects if re.search(obj_regex_, x)]
        else:
            object_results = available_objects

        return object_results
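    # Example usage (a sketch; bucket and prefix are hypothetical placeholders):
    #
    #   csv_keys = s3.get_obj_list(
    #       bucket_name_="my-bucket",
    #       prefix_="exports/2021/",
    #       prefix_filter_=None,
    #       obj_regex_=r"\.csv$",
    #       give_full_path_=True)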

    def get_obj_stats(self, bucket_name_, prefix_, obj_name_):
        """
        Returns a dictionary of an object's stats, including size and last
        modified date.
        :param bucket_name_: Bucket name.
        :param prefix_: File prefix (i.e. S3 folder path).
        :param obj_name_: Name of object.
        :return: Dictionary of various object stats.
        """

        stats = {}

        try:
            obj_header = self.client.head_object(Bucket=bucket_name_, Key=os.path.join(prefix_, obj_name_))

            stats["size_bytes"] = obj_header["ContentLength"]
            stats["size_mb"] = obj_header["ContentLength"] / 1048576  # 1 MB = 1,048,576 bytes
            stats["last_modified"] = obj_header["LastModified"]

        except ClientError as e:
            logging.info(f"There was an error retrieving stats for {obj_name_}. {e}")

        return stats
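    # Example usage (a sketch; names are hypothetical placeholders):
    #
    #   stats = s3.get_obj_stats("my-bucket", "exports/2021/", "orders.csv")
    #   # stats -> {"size_bytes": ..., "size_mb": ..., "last_modified": datetime(...)}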

    def get_file_obj_record_count(self, file_obj_):
        """
        Gets the total number of records contained in a file object. Seeks the file object back to 0 after counting.
        :param file_obj_: Object. File as an object from S3.
        :return: Int. Number of records in file object.
        """

        record_count = 0
        for line in file_obj_:
            record_count += 1
        file_obj_.seek(0)

        return record_count

    def get_file_as_file_object(self,
                                bucket_name_,
                                prefix_,
                                file_name_,
                                file_format_=None,
                                decode_='utf-8',
                                compression_type_=None):
        """
        Creates a file object from a given S3 file.
        :param bucket_name_: String. Bucket name.
        :param prefix_: String. File prefix (i.e. S3 folder path).
        :param file_name_: String. Name of the object to read from S3.
        :param file_format_: String indicating binary, base64, or text format.
        :param decode_: String. The character encoding of the string. Ex: utf-8.
        :param compression_type_: Compression type of S3 object.
        :return: File-like object of either StringIO or BytesIO type.
        """
        file_obj = self.client.get_object(Bucket=bucket_name_, Key=os.path.join(prefix_, file_name_))
        if compression_type_ in ('gz', 'gzip') or file_format_ == 'binary':
            logging.info("Getting S3 file object as BytesIO...")
            s_buf = io.BytesIO(file_obj["Body"].read())
        elif file_format_ == 'base64':
            logging.info("Getting S3 file object as base64-decoded BytesIO...")
            s_buf = io.BytesIO(base64.b64decode(file_obj["Body"].read()))
        else:
            logging.info("Getting S3 file object as StringIO...")
            s_buf = io.StringIO(file_obj["Body"].read().decode(decode_))
        return s_buf
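    # Example usage (a sketch; names are hypothetical placeholders):
    #
    #   buf = s3.get_file_as_file_object("my-bucket", "exports/2021/", "orders.csv")
    #   record_count = s3.get_file_obj_record_count(buf)
    #   header = buf.readline()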

    def write_file(self, bucket_name_, prefix_, file_name_, file_obj_, return_stats_=True):
        """
        Takes a file object and writes out contents to file on S3.
        :param bucket_name_: Bucket name.
        :param prefix_: File prefix (i.e. S3 folder path).
        :param file_name_: Name to write object as in S3.
        :param file_obj_: File as an object to write to S3.
        :param return_stats_: Boolean. If True, checks item in S3 and returns size as confirmation of upload.
        :return: String of the uploaded object's size in MB if return_stats_ is True, otherwise None.
        """

        try:
            self.client.put_object(Body=file_obj_.read(), Bucket=bucket_name_, Key=os.path.join(prefix_, file_name_))
            logging.info("Object written to S3.")

        except ClientError as e:
            logging.info(f"There was an error {e} while uploading {file_name_} to S3.")

        if return_stats_:
            stats = self.get_obj_stats(bucket_name_, prefix_, file_name_)
            result = f'{round(stats["size_mb"], 5):,} MB'
            logging.info(result)
            return result
        else:
            return None
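    # Example usage (a sketch; bucket, prefix, and contents are hypothetical placeholders):
    #
    #   buf = io.StringIO("id,amount\n1,9.99\n")
    #   s3.write_file("my-bucket", "exports/2021/", "orders.csv", buf)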

    def upload_file(self,
                    bucket_name_,
                    prefix_,
                    local_object_path_,
                    local_object_name_,
                    in_bucket_object_name_=None,
                    region_="us-east-2"):
        """
        Upload a file to an S3 bucket.

        :param bucket_name_: String. Bucket name to upload file to.
        :param prefix_: String. Prefix in bucket to upload file to.
        :param local_object_path_: String. Local path to file to upload.
        :param local_object_name_: String. Local name of file to upload.
        :param in_bucket_object_name_: String. Name to write file to in bucket. If None, file name will be
            name of file from local_object_path_.
        :param region_: AWS Region. Default 'us-east-2'.
        :return: String. Target path if successful upload.
        """

        local_file = os.path.join(local_object_path_, local_object_name_)
        object_name_ = in_bucket_object_name_ or local_object_name_
        object_key_ = os.path.join(prefix_, object_name_)

        try:
            self.client.upload_file(Filename=local_file, Bucket=bucket_name_, Key=object_key_)
            stats = self.get_obj_stats(bucket_name_=bucket_name_, prefix_=prefix_, obj_name_=object_name_)
            if stats:
                logging.info(f'File {object_name_} written to bucket. Size: {round(stats["size_mb"], 5):,} MB')
            target_path = f"https://{bucket_name_}.s3.{region_}.amazonaws.com/{object_key_}"
            return target_path

        except ClientError as e:
            logging.info(f"There was an error while uploading {in_bucket_object_name_} to S3. Error: {e}")
            return None
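    # Example usage (a sketch; paths are hypothetical placeholders):
    #
    #   url = s3.upload_file(
    #       bucket_name_="my-bucket",
    #       prefix_="exports/2021/",
    #       local_object_path_="/tmp",
    #       local_object_name_="orders.csv")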

    def __get_bucket_resource(self, bucket_name_, region_="us-east-2"):
        """
        Creates and returns a bucket resource.
        :param bucket_name_: String. Bucket name to get resource for.
        :param region_: AWS Region. Default 'us-east-2'.
        :return: boto3 bucket resource.
        """

        bucket = self.resource.Bucket(bucket_name_)
        return bucket

    def __create_bucket(self, bucket_name_, region_="us-east-2"):
        """
        Creates an S3 bucket.
        :param bucket_name_: String. Bucket name to create.
        :param region_: AWS Region. Default 'us-east-2'.
        :return: boto3 bucket resource for the newly created bucket.
        """

        bucket = self.resource.create_bucket(
            Bucket=bucket_name_, CreateBucketConfiguration={'LocationConstraint': region_})
        return bucket

    def delete_object(self, bucket_name_, prefix_=None, object_name_=None):
        """
        Deletes an object from S3.

        :param bucket_name_: String. Bucket name to delete file from.
        :param prefix_: String. Prefix in bucket to delete file from.
        :param object_name_: String. Name of object to delete.
        :return: Dict. Response from AWS.
        """

        # Guard against prefix_ being None so os.path.join does not raise.
        object_key = os.path.join(prefix_ or "", object_name_)

        response = self.client.delete_object(Bucket=bucket_name_, Key=object_key)
        return response
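    # Example usage (a sketch; names are hypothetical placeholders):
    #
    #   s3.delete_object("my-bucket", prefix_="exports/2021/", object_name_="orders.csv")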

    def set_expiration_policy(self, bucket_name_, expiration_in_days_=1):
        """
        Sets an expiration policy for a bucket.

        :param bucket_name_: String. Bucket name to set the expiration policy on.
        :param expiration_in_days_: Int. Expiration days.
        :return: Dict. Response from AWS.
        """

        logging.info(f"Attempting to set {expiration_in_days_:,} day expiration policy for bucket {bucket_name_}.")

        try:
            response = self.client.put_bucket_lifecycle_configuration(
                Bucket=bucket_name_,
                LifecycleConfiguration={
                    'Rules': [{
                        'Expiration': {
                            'Days': expiration_in_days_
                        },
                        'ID': f'delete-after-{expiration_in_days_}-days',
                        'Prefix': '',
                        'Status': 'Enabled'
                    }]
                })

            if response and response["ResponseMetadata"]["HTTPStatusCode"] != 200:
                raise Exception(
                    f"Update to {bucket_name_}'s life cycle policy did not return a 200 response:\n{response}")
            else:
                logging.info("Life cycle policy set/updated.")

            return response

        except Exception as e:
            logging.info(f"Failed to update {bucket_name_} life cycle policy due to:\n{e}")
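    # Example usage (a sketch; the bucket name is a hypothetical placeholder):
    #
    #   s3.set_expiration_policy("my-scratch-bucket", expiration_in_days_=7)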

    def write_bytes(self, data_bytes_, object_name_, bucket_name_=None, prefix_='', region_="us-east-2"):
        """
        Writes bytes out to an S3 bucket.

        :param data_bytes_: Bytes. Data to write out.
        :param object_name_: String. Name of object to write bytes as.
        :param bucket_name_: String. Bucket name to write bytes to.
        :param prefix_: String. Prefix in bucket to write bytes to.
        :param region_: AWS Region. Default 'us-east-2'.
        :return: Dict of write results:
            {
                'target_url': target_url,
                'target_arn': target_arn,
                'bucket': put_response.bucket_name,
                'key': put_response.key
            }
        """

        if not isinstance(data_bytes_, bytes):
            raise ValueError(f'data_bytes_ must be of type "Bytes", not "{type(data_bytes_).__name__}".')

        object_path_ = os.path.join(prefix_, object_name_)

        # Reuse the bucket if it already exists; otherwise create it.
        bucket = self.__get_bucket_resource(bucket_name_, region_)
        bucket = bucket if bucket.creation_date else self.__create_bucket(bucket_name_)

        try:
            put_response = bucket.put_object(Key=object_path_, Body=data_bytes_)
        except ClientError as e:
            logging.info(f'There was an error while uploading {object_name_} to S3. Error: {e}')
            # On failure the exception instance is returned to the caller rather than re-raised.
            return BaseException(e)

        target_url = f'https://{put_response.bucket_name}.s3.{region_}.amazonaws.com/{put_response.key}'
        target_arn = f'arn:s3://{put_response.bucket_name}/{put_response.key}'
        return {
            'target_url': target_url,
            'target_arn': target_arn,
            'bucket': put_response.bucket_name,
            'key': put_response.key
        }
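    # Example usage (a sketch; bucket and key are hypothetical placeholders):
    #
    #   result = s3.write_bytes(b"hello world", "hello.txt",
    #                           bucket_name_="my-bucket", prefix_="scratch/")
    #   result["target_url"]  # e.g. "https://my-bucket.s3.us-east-2.amazonaws.com/scratch/hello.txt"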

    def move_object(self, src_bucket_name_, src_prefix_, obj_name_, new_obj_name_, dest_bucket_name_, dest_prefix_):
        """
        Moves an object from a source S3 bucket and prefix to another.
        :param src_bucket_name_: String. Source Bucket name.
        :param src_prefix_: String. Source File prefix (i.e. S3 folder path).
        :param obj_name_: String. Name of the source object in S3.
        :param new_obj_name_: String. Name to write object as in S3.
        :param dest_bucket_name_: String. Destination Bucket name.
        :param dest_prefix_: String. Destination File prefix (i.e. S3 folder path).
        :return: Response from the delete step of the move (the object is copied, then deleted from the source).
        """
        try:
            src_key = os.path.join(src_prefix_, obj_name_)
            dest_key = os.path.join(dest_prefix_, new_obj_name_)
            copy_source = {'Bucket': src_bucket_name_, 'Key': src_key}
            self.client.copy(copy_source, dest_bucket_name_, dest_key)
            response = self.client.delete_object(Bucket=src_bucket_name_, Key=src_key)
            return response
        except Exception as e:
            logging.info(f"There was an error while moving object {obj_name_}. {e}")
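    # Example usage (a sketch; names are hypothetical placeholders):
    #
    #   s3.move_object("my-bucket", "landing/", "orders.csv", "orders_2021.csv",
    #                  "archive-bucket", "processed/")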

    def copy_object(self,
                    src_bucket_name_,
                    src_prefix_,
                    obj_name_,
                    dest_bucket_name_,
                    dest_prefix_,
                    dest_obj_name_=None):
        """
        Copy an object from a source S3 bucket and prefix to another.
        :param src_bucket_name_: String. Source Bucket name.
        :param src_prefix_: String. Source File prefix (i.e. S3 folder path).
        :param obj_name_: String. Name of the source object in S3.
        :param dest_bucket_name_: String. Destination Bucket name.
        :param dest_prefix_: String. Destination File prefix (i.e. S3 folder path).
        :param dest_obj_name_: String. Destination file name. If None, the source file name is used.
        :return: Response from the copy operation.
        """
        try:
            src_key = os.path.join(src_prefix_, obj_name_)
            dest_key = os.path.join(dest_prefix_, dest_obj_name_ if dest_obj_name_ is not None else obj_name_)
            copy_source = {'Bucket': src_bucket_name_, 'Key': src_key}
            response = self.client.copy(copy_source, dest_bucket_name_, dest_key)
            return response
        except Exception as e:
            logging.info(f"There was an error while copying object {obj_name_}. {e}")
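    # Example usage (a sketch; names are hypothetical placeholders):
    #
    #   s3.copy_object("my-bucket", "landing/", "orders.csv",
    #                  "backup-bucket", "backups/2021/")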