# Source: boilerplates/python/aws/s3/getS3Info.py
# (157 lines, 4.7 KiB, Python)
import boto3
import logging
import os
import datetime
from splunk_hec_handler import SplunkHecHandler
import warnings
import customLogLevel
from socket import gaierror
class getFileDetails(object):
    """Query an S3 bucket for information about files stored.

    Required arguments:
        bucket: a string with the name of the bucket
        remoteFiles: a string containing the path and filename of the files
        client: a boto3 s3 client
        resource: a boto3 s3 resource
    Optional arguments:
        logger: a splunk HEC handler for the logging module
    """

    def __init__(self,
                 bucket: str,
                 remoteFiles: str,
                 client: "boto3.client",
                 resource: "boto3.resource",
                 logger: logging.Logger = None
                 ):
        # Annotations for the boto3 arguments are quoted: boto3.client /
        # boto3.resource are factory functions, not types, and unquoted they
        # are evaluated at class-definition time.
        super().__init__()
        self.bucket = bucket
        self.remoteFiles = remoteFiles
        self.client = client
        self.resource = resource
        self.logger = logger

    @staticmethod
    def formatFolder(remoteFolder: str) -> str:
        """Ensure a non-empty folder name ends with exactly one trailing '/'.

        An empty string is returned unchanged, meaning "bucket root".
        """
        if remoteFolder and not remoteFolder.endswith('/'):
            remoteFolder = f'{remoteFolder}/'
        return remoteFolder

    @staticmethod
    def generateDate(date_format: str = '%Y-%m-%d',
                     time: datetime.datetime = None
                     ) -> str:
        """Generate a human readable time string for a
        datetime.datetime object.
        By default will use today with %Y-%m-%d format.
        """
        if time is None:
            time = datetime.date.today()
        return time.strftime(date_format)

    @staticmethod
    def getEpochTime(time: datetime.datetime) -> float:
        """Return the Unix epoch timestamp (seconds) for *time*."""
        return datetime.datetime.timestamp(time)

    def getS3Files(self):
        """Collect metadata for S3 objects matching self.remoteFiles.

        Populates self.fileDict as {index: object-metadata-dict}, adding a
        '_time' key (collection time, epoch seconds) and converting
        'LastModified' to epoch seconds for Splunk ingestion.
        Returns self so calls can be chained.
        """
        # NOTE(review): 'list_objects' is the legacy S3 API; AWS recommends
        # 'list_objects_v2' -- left unchanged to avoid altering behavior.
        self.paginator = self.client.get_paginator('list_objects')
        self.iterator = self.paginator.paginate(Bucket=self.bucket,
                                                Prefix=self.remoteFiles)
        self.filtered = self.iterator.search('Contents[*]')
        self.fileDict = dict()
        counter = 0
        for item in self.filtered:
            # JMESPath search yields None for pages with no 'Contents' key
            # (e.g. nothing matches the prefix); skip those instead of
            # crashing on item.update().
            if item is None:
                continue
            item['_time'] = self.getEpochTime(datetime.datetime.now())
            item['LastModified'] = self.getEpochTime(item['LastModified'])
            self.fileDict[counter] = item
            counter += 1
        return self

    def sendToSplunk(self):
        """Log every collected file record at the custom 'splunk' level.

        Requires getS3Files() to have run first and a logger with the
        custom 'splunk' level registered (see customLogLevel in _call).
        Returns self so calls can be chained.
        """
        for record in self.fileDict.values():
            try:
                self.logger.splunk(record)
            except AttributeError as err:
                # Chain the original error so the missing-level cause is
                # visible in the traceback.
                raise Exception("No logger level exists for the custom Splunk"
                                " level. Try creating a customLogLevel for"
                                " splunk and try again.") from err
        return self
def _call():
    """Script entry point: configure the AWS env, Splunk HEC logging and
    the custom 'splunk' log level, then run main().

    Exits with a non-zero status if the Splunk HEC host cannot be resolved.
    """
    # Define env variables for AWS boto3
    os.environ['AWS_PROFILE'] = 'netacea'
    os.environ['AWS_DEFAULT_REGION'] = 'eu-west-1'
    # Define bucket and file names
    bucket = 'td-ingest-storage-williamhill'
    remoteFolder = 'bot_predictions/'
    remoteFilePrefix = 'blocking_suggestions_'
    append_date = True
    date_format = '%Y-%m-%d'
    # SECURITY: the HEC token is a secret and should not live in source
    # control.  Prefer the SPLUNK_HEC_TOKEN env var; the literal below is
    # kept only as a backward-compatible fallback.
    hec_token = os.environ.get('SPLUNK_HEC_TOKEN',
                               'ea641e31-870e-4f5f-965f-57e0c9a2aa3d')
    # Define splunk hec handler for logging
    try:
        splunk_handler = SplunkHecHandler('sc1uxpremn81.prod.williamhill.plc',
                                          hec_token,
                                          port=8088, proto='https',
                                          ssl_verify=False,
                                          sourcetype='httpevent')
    except gaierror as err:
        # Exit non-zero so shells/cron detect the failure; a bare
        # `raise SystemExit` would exit with status 0 and hide the error.
        raise SystemExit(f'Cannot resolve Splunk HEC host: {err}') from err
    logger = logging.getLogger('SplunkHecHandlerExample')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(splunk_handler)
    # Register the custom 'splunk' level (numeric 15) used by sendToSplunk().
    customLogLevel.addLoggingLevel('splunk', 15, logging)
    main(bucket,
         remoteFolder,
         remoteFilePrefix,
         append_date,
         date_format,
         logger)
def main(bucket: str,
         remoteFolder: str,
         remoteFilePrefix: str,
         append_date: bool,
         date_format: str,
         logger: logging.Logger):
    """Build the S3 key prefix, fetch matching object metadata and ship
    it to Splunk.

    Arguments:
        bucket: name of the S3 bucket to query
        remoteFolder: folder (key prefix) inside the bucket
        remoteFilePrefix: filename prefix to match within the folder
        append_date: when True, append today's date to the prefix so only
            the current day's files match
        date_format: strftime format used for the appended date
        logger: logger with the custom 'splunk' level registered
    """
    # Original annotation was `logging.getLogger` (a function); the correct
    # type for a logger instance is logging.Logger.
    if append_date:
        remoteFileDate = getFileDetails.generateDate(date_format)
    else:
        remoteFileDate = ''
    remoteFolder = getFileDetails.formatFolder(remoteFolder)
    remoteFiles = f'{remoteFolder}{remoteFilePrefix}{remoteFileDate}'
    client = boto3.client('s3')
    resource = boto3.resource('s3')
    instance = getFileDetails(bucket,
                              remoteFiles,
                              client,
                              resource,
                              logger)
    instance.getS3Files().sendToSplunk()
if __name__ == '__main__':
    # Suppress all warnings -- presumably the urllib3 InsecureRequestWarning
    # triggered by the handler's ssl_verify=False; confirm before narrowing.
    warnings.filterwarnings("ignore")
    _call()