adding initial boilerplates

2019-11-29 02:29:44 +00:00
parent 79cc65825a
commit 88e3a5bb6e
26 changed files with 1136 additions and 2 deletions


@@ -0,0 +1,257 @@
import boto3
import botocore
import os
from datetime import date, timedelta
import sys
import logging
import argparse
import glob
# Set Global Variables
log_location = 'pull.log'
remote_folder = ['bot_predictions/']
remote_file_prefix = ['blocking_suggestions_']
append_date = ['True']
date_format = ['%Y-%m-%d']
bucket = ['td-ingest-storage-williamhill']
# credentials are read from the environment so real keys are never
# committed to version control
access_key = [os.environ.get('AWS_ACCESS_KEY_ID', '')]
secret_key = [os.environ.get('AWS_SECRET_ACCESS_KEY', '')]
class downloadFiles(object):
"""docstring for downloadFiles"""
today = date.today()
yesterday = date.today() - timedelta(1)
def __init__(self,
client,
resource,
bucket,
remote_folder,
remote_file_prefix,
local_path,
append_date=False,
date_format=''):
super(downloadFiles, self).__init__()
self.client = client
self.resource = resource
self.bucket = bucket
self.append_date = append_date
self.date_format = date_format
self.remote_folder = self._folder_fixer(remote_folder)
self.dest = f'{self.remote_folder!s}{remote_file_prefix!s}'
self.local_path = local_path
self.remote_list, self.local_list, self.local_file_list = \
(list() for _ in range(3))
    @staticmethod
    def generate_date(date_format, relative_day='today'):
        if relative_day == 'today':
            return downloadFiles.today.strftime(date_format)
        elif relative_day == 'yesterday':
            return downloadFiles.yesterday.strftime(date_format)
        raise ValueError(
            f"relative_day must be 'today' or 'yesterday', got {relative_day!r}")
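    # e.g. generate_date('%Y-%m-%d') -> '2019-11-29' when run on that date;
    # generate_date('%Y%m%d', 'yesterday') -> '20191128'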
@staticmethod
def _folder_fixer(folder):
try:
if folder[-1] != '/':
folder = f'{folder}/'
except IndexError:
folder = ''
return folder
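    # e.g. _folder_fixer('downloads') -> 'downloads/'; an empty string is
    # returned unchanged and later resolves to the bucket root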
def get_path(self):
if self.local_path:
self.local_path = self._folder_fixer(self.local_path)
logger.info(f'path entered is {self.local_path}')
return self
else:
self.local_path = os.getcwd()
self.local_path = self._folder_fixer(self.local_path)
self.local_path = f'{self.local_path}blocking_suggestions/'
logger.info(f'no path entered, using current directory '
f'{self.local_path}')
return self
def get_files(self):
if self.append_date:
date_today = self.generate_date(self.date_format)
date_yesterday = self.generate_date(self.date_format, 'yesterday')
else:
date_today = ''
date_yesterday = ''
self.dest_list = []
self.dest_list.append(f'{self.dest!s}{date_today!s}')
self.dest_list.append(f'{self.dest!s}{date_yesterday!s}')
for dest in self.dest_list:
paginator = self.client.get_paginator('list_objects')
iterator = paginator.paginate(Bucket=self.bucket, Prefix=dest)
self.filtered = iterator.search('Contents[*].Key')
            for i in self.filtered:
                if i is None:  # the page contained no matching keys
                    logger.info('no files available to download -- exiting')
                    raise SystemExit
                self.remote_list.append(i)
                self.local_list.append(
                    f'{self.local_path}{i[len(self.remote_folder):]}'
                )
                self.local_file_list.append(
                    f'{i[len(self.remote_folder):]}'
                )
logger.debug(f'remote files are {self.remote_list}')
logger.debug(f'saving files locally to {self.local_list}')
return self
def get_history(self):
self.history_file = f'{self.local_path}.history.txt'
        try:
            logger.info('opening history file')
            # create the history file if it does not exist yet
            open(self.history_file, 'a').close()
        except FileNotFoundError:
            logger.critical('history file cannot be found or created'
                            ' - check permissions of the folder.')
            raise
        with open(self.history_file) as hist:
            self.history_list = [line.rstrip('\n') for line in hist]
        return self
def remove_files(self):
logger.info('attempting to clear current files')
        # skip dotfiles so the hidden .history.txt tracking file survives
        current_files = glob.glob(f'{self.local_path}[!.]*')
if current_files:
for i in current_files:
try:
os.remove(i)
logger.info(f'removed {i}')
except OSError:
logger.exception('Error:')
else:
logger.info('no files to remove')
return self
def download_files(self):
for remote_file, local_file_with_path, local_file in zip(
self.remote_list, self.local_list, self.local_file_list):
            if local_file not in self.history_list:
                # boto3 writes the destination file itself, so only the
                # history file needs to be held open here
                with open(self.history_file, 'a') as hist:
                    try:
                        self.resource.Bucket(self.bucket).download_file(
                            remote_file, local_file_with_path)
                        hist.write(f'\n{local_file}')
                        logger.info(f'downloaded {local_file}')
                    except botocore.exceptions.ClientError as e:
                        if e.response['Error']['Code'] == '404':
                            logger.error(
                                f'The object {remote_file} does not exist.')
                        else:
                            raise
            else:
                logger.debug(f'{local_file} already downloaded - skipping')
return self
def _call():
global args, debug
parser = argparse.ArgumentParser(description="""
downloads any new files for the current day from an S3 bucket. \
uses a local history file to track what has been \
previously downloaded in the download path.
""")
    parser.add_argument('--path', type=str,
                        help='path to download into; if left blank, a '
                             'blocking_suggestions folder under the current '
                             'working directory is used.',
                        default='')
parser.add_argument('--debug', action='store_true', default=False,
help='Use this to log DEBUG information.')
args = parser.parse_args()
    debug = args.debug
if debug:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)
main(_clients=_clients,
_resources=_resources,
_buckets=_buckets,
remote_folder=remote_folder,
remote_file_prefix=remote_file_prefix,
append_date=append_date,
date_format=date_format,
**vars(args))
def main(*args,
_clients={'client0': ''},
_resources={'resource0': ''},
_buckets={'bucket0': ''},
remote_folder=[''],
remote_file_prefix=[''],
append_date=['True'],
date_format=[''],
path='',
**kwargs):
logger.info('========= SCRIPT STARTED =========')
instance = downloadFiles(client=_clients['client0'],
resource=_resources['resource0'],
bucket=_buckets['bucket0'],
remote_folder=remote_folder[0],
remote_file_prefix=remote_file_prefix[0],
local_path=path,
append_date=append_date[0],
date_format=date_format[0])
instance.get_path().get_files().get_history().remove_files()\
.download_files()
logger.info('========= SCRIPT FINISHED =========')
if __name__ == '__main__':
args, debug = '', ''
# define logging
logger = logging.getLogger(__name__)
c_handler = logging.StreamHandler(sys.stdout)
f_handler = logging.FileHandler(log_location)
c_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
f_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)
logger.addHandler(c_handler)
logger.addHandler(f_handler)
_clients = {}
_resources = {}
_buckets = {}
    for i, bucket_name in enumerate(bucket):
        _clients[f'client{i}'] = boto3.client(
            's3',
            aws_access_key_id=access_key[i],
            aws_secret_access_key=secret_key[i])
        _resources[f'resource{i}'] = boto3.resource(
            's3',
            aws_access_key_id=access_key[i],
            aws_secret_access_key=secret_key[i])
        _buckets[f'bucket{i}'] = bucket_name
    # an empty folder list or the literal 'root' means the bucket root
    if not remote_folder:
        remote_folder = ['']
    elif remote_folder[0] == 'root':
        remote_folder = ['']
_call()
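
For reference, a minimal sketch of driving the downloadFiles class directly, bypassing the argparse entry point. The bucket name and paths are placeholders, and the class reads the module-level `logger`, so this assumes it runs in the same module as the class:

import logging
import sys
import boto3

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.INFO)

downloader = downloadFiles(client=boto3.client('s3'),
                           resource=boto3.resource('s3'),
                           bucket='example-bucket',
                           remote_folder='bot_predictions/',
                           remote_file_prefix='blocking_suggestions_',
                           local_path='/tmp/suggestions',  # must already exist
                           append_date=True,
                           date_format='%Y-%m-%d')
downloader.get_path().get_files().get_history().remove_files().download_files()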

python/aws/s3/getS3Info.py Normal file

@@ -0,0 +1,156 @@
import boto3
import logging
import os
import datetime
from splunk_hec_handler import SplunkHecHandler
import warnings
import customLogLevel
from socket import gaierror
class getFileDetails(object):
"""Query an S3 bucket for information about files stored
Required arguments:
bucket: a string with the name of the bucket
remoteFiles: a string containing the path and filename of the files
client: a boto3 s3 client
resource: a boto3 s3 resource
Optional arguments:
logger: a splunk HEC handler for the logging module
"""
def __init__(self,
bucket: str,
remoteFiles: str,
client: boto3.client,
resource: boto3.resource,
                 logger: logging.Logger = None
):
super().__init__()
self.bucket = bucket
self.remoteFiles = remoteFiles
self.client = client
self.resource = resource
self.logger = logger
@staticmethod
def formatFolder(remoteFolder: str):
try:
if remoteFolder[-1] != '/':
remoteFolder = f'{remoteFolder}/'
except IndexError:
remoteFolder = ''
return remoteFolder
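    # e.g. formatFolder('bot_predictions') -> 'bot_predictions/', while an
    # empty string comes back unchanged (the bucket root)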
@staticmethod
def generateDate(date_format='%Y-%m-%d',
time: datetime.datetime = None
):
"""
Generates a human readable time string for a
datetime.datetime object.
By default will use today with %Y-%m-%d format """
if time is None:
time = datetime.date.today()
date = time.strftime(date_format)
return date
@staticmethod
def getEpochTime(time: datetime.datetime):
epoch = datetime.datetime.timestamp(time)
return epoch
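    # datetime objects are not JSON-serialisable, so getS3Files() below
    # converts both the event time (_time) and LastModified to epoch floats
    # before the records are shipped to Splunk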
def getS3Files(self):
self.paginator = self.client.get_paginator('list_objects')
self.iterator = self.paginator.paginate(Bucket=self.bucket,
Prefix=self.remoteFiles)
self.filtered = self.iterator.search('Contents[*]')
        self.fileDict = dict()
        for counter, i in enumerate(self.filtered):
            if i is None:  # the page contained no matching objects
                continue
            now = self.getEpochTime(datetime.datetime.now())
            i.update({'_time': now})
            i['LastModified'] = self.getEpochTime(i['LastModified'])
            self.fileDict[counter] = i
        return self
def sendToSplunk(self):
for i in self.fileDict.values():
try:
self.logger.splunk(i)
            except AttributeError:
                raise Exception("the logger has no 'splunk' method; register"
                                " one with customLogLevel.addLoggingLevel"
                                " before calling sendToSplunk.")
return self
def _call():
# Define env variables for AWS boto3
os.environ['AWS_PROFILE'] = 'netacea'
os.environ['AWS_DEFAULT_REGION'] = 'eu-west-1'
# Define bucket and file names
bucket = 'td-ingest-storage-williamhill'
remoteFolder = 'bot_predictions/'
remoteFilePrefix = 'blocking_suggestions_'
append_date = True
date_format = '%Y-%m-%d'
# Define splunk hec handler for logging
try:
        splunk_handler = SplunkHecHandler(
            'sc1uxpremn81.prod.williamhill.plc',
            # the HEC token is read from the environment rather than hard-coded
            os.environ.get('SPLUNK_HEC_TOKEN', ''),
            port=8088, proto='https',
            ssl_verify=False,
            sourcetype='httpevent')
except gaierror:
raise SystemExit
logger = logging.getLogger('SplunkHecHandlerExample')
logger.setLevel(logging.DEBUG)
logger.addHandler(splunk_handler)
customLogLevel.addLoggingLevel('splunk', 15, logging)
main(bucket,
remoteFolder,
remoteFilePrefix,
append_date,
date_format,
logger)
def main(bucket: str,
remoteFolder: str,
remoteFilePrefix: str,
append_date: bool,
date_format: str,
         logger: logging.Logger):
if append_date:
remoteFileDate = getFileDetails.generateDate(date_format)
else:
remoteFileDate = ''
remoteFolder = getFileDetails.formatFolder(remoteFolder)
remoteFiles = f'{remoteFolder}{remoteFilePrefix}{remoteFileDate}'
client = boto3.client('s3')
resource = boto3.resource('s3')
instance = getFileDetails(bucket,
remoteFiles,
client,
resource,
logger)
instance.getS3Files().sendToSplunk()
if __name__ == '__main__':
warnings.filterwarnings("ignore")
_call()
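
The customLogLevel module imported above is not part of this commit; the call customLogLevel.addLoggingLevel('splunk', 15, logging) suggests a helper along the lines of the common add-a-log-level recipe, sketched here rather than taken from the author's implementation:

import logging

def addLoggingLevel(levelName, levelNum, _logging=logging):
    """Register a custom log level plus a matching Logger method."""
    _logging.addLevelName(levelNum, levelName.upper())

    def logForLevel(self, message, *args, **kwargs):
        if self.isEnabledFor(levelNum):
            self._log(levelNum, message, args, **kwargs)

    setattr(_logging.Logger, levelName, logForLevel)
    setattr(_logging, levelName.upper(), levelNum)

With this in place, logger.splunk(event) emits at level 15, which is what sendToSplunk() relies on.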


@@ -0,0 +1,141 @@
import boto3
import base64
import math
import json
from datetime import datetime
class receiveFromSQS(object):
"""docstring for receiveFromSQS"""
def __init__(self, session, queueURL):
super(receiveFromSQS, self).__init__()
self.session = session
self.sqs = session.client('sqs')
self.queueURL = queueURL
self.messages = []
@classmethod
def createSession(cls, profileName, queueURL):
session = boto3.Session(profile_name=profileName)
return cls(session, queueURL)
def getQueueLength(self):
attributeNames = ['ApproximateNumberOfMessages']
self.queueAttributes = self.sqs.get_queue_attributes(
QueueUrl=self.queueURL, AttributeNames=attributeNames
)
self.queueLength = int(
self.queueAttributes['Attributes']['ApproximateNumberOfMessages']
)
return self.queueLength
def _receiveSQSMessage(
self, totalNumberOfMessages, maxNumberOfMessages=10
):
self.resp = []
self.loops = int(
math.ceil(totalNumberOfMessages / maxNumberOfMessages)
)
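        # SQS returns at most 10 messages per receive_message call, so the
        # queue is drained in ceil(total / 10) batches and the final batch
        # requests only the remainder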
loopTrack = 0
if totalNumberOfMessages <= 10:
maxNumberOfMessages = totalNumberOfMessages
else:
maxNumberOfMessagesFinal = 10 - (
(self.loops * maxNumberOfMessages) - totalNumberOfMessages
)
if self.loops == 0:
raise RuntimeError('No messages in the queue')
for i in range(0, self.loops):
if loopTrack == self.loops - 1 and totalNumberOfMessages > 10:
maxNumberOfMessages = maxNumberOfMessagesFinal
self.resp.append(
self.sqs.receive_message(
QueueUrl=self.queueURL,
MaxNumberOfMessages=maxNumberOfMessages,
)
)
try:
entries = [
{
'Id': msg['MessageId'],
'ReceiptHandle': msg['ReceiptHandle'],
}
for msg in self.resp[i]['Messages']
]
self._deleteSQSMessages(entries)
loopTrack += 1
except KeyError:
print("No messages in the queue")
return self
def _extractMessageFromSQS(self, totalNumberOfMessages):
self.extractedMessages = []
self.receiptHandles = []
        # check for the 'Messages' key before indexing into each response,
        # so an empty receive batch is skipped instead of raising KeyError
        for resp in self.resp:
            if 'Messages' not in resp:
                print('No messages in the queue')
                continue
            for message in resp['Messages']:
                self.extractedMessages.append(message['Body'])
        return self
def _deleteSQSMessages(self, entries):
self.respDelete = self.sqs.delete_message_batch(
QueueUrl=self.queueURL, Entries=entries
)
if len(self.respDelete['Successful']) != len(entries):
raise RuntimeError(
f'Failed to delete messages: entries={entries!r}'
f' resp={self.respDelete!r}'
)
def _decodeMessages(self):
if len(self.extractedMessages) == 0:
print('No messages to process')
else:
for message in self.extractedMessages:
decoded = base64.b64decode(message).decode()
self.messages.append(decoded)
return self
def receiveAllMessages(self, b64=True, _totalNumberOfMessages=None):
if _totalNumberOfMessages is None:
totalNumberOfMessages = self.getQueueLength()
else:
totalNumberOfMessages = _totalNumberOfMessages
self._receiveSQSMessage(totalNumberOfMessages)
self._extractMessageFromSQS(
totalNumberOfMessages=totalNumberOfMessages
)
if b64:
self._decodeMessages()
else:
self.messages = self.extractedMessages
return self
def receiveNMessages(self, numberOfMessages, b64=True):
self.receiveAllMessages(
b64=b64, _totalNumberOfMessages=numberOfMessages
)
return self
# def generateOutput(self, outputType='json'):
# if outputType == 'json':
# self.output = json.dumps(self.messages)
# return self.output
def savetoDisk(self, path):
        # avoid colons in the filename so it stays valid on all platforms
        self.timeNow = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
if len(self.messages) > 0:
with open(f'{path}/{self.timeNow}.json', 'w+') as outputFile:
json.dump(self.messages, outputFile)
else:
print('No messages to save')
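
A minimal usage sketch for the class above (the profile name, queue URL, and output directory are placeholders):

queue = receiveFromSQS.createSession(
    profileName='example-profile',
    queueURL='https://sqs.eu-west-1.amazonaws.com/123456789012/example.fifo',
)
print(f'{queue.getQueueLength()} messages waiting')
queue.receiveAllMessages(b64=True).savetoDisk('/tmp')  # directory must exist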


@@ -0,0 +1,63 @@
import boto3
import base64
import hashlib
from datetime import datetime
import os
import sys
sys.path.append(os.getcwd())
from pullTrafficInfo import getTrafficInfo
class sendToSQS(object):
"""docstring for sendToSQS"""
def __init__(self, session, queueURL):
super(sendToSQS, self).__init__()
self.session = session
self.sqs = session.client('sqs')
self.queueURL = queueURL
@classmethod
def createSession(cls, profileName, queueURL):
session = boto3.Session(profile_name=profileName)
return cls(session, queueURL)
def sendMessage(self, message, messageGroupId, b64=True, dedup=False):
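        # FIFO queues require a MessageDeduplicationId unless content-based
        # deduplication is enabled on the queue; when dedup is False an
        # explicit id is derived from the body plus the current time so that
        # repeated payloads are not silently dropped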
currentTime = datetime.now().strftime('%H:%M:%S.%f')
if b64:
message = (base64.b64encode(message.encode())).decode()
if not dedup:
dedupId = hashlib.md5((message + currentTime).encode()).hexdigest()
msg = self.sqs.send_message(
QueueUrl=self.queueURL,
MessageBody=message,
MessageGroupId=messageGroupId,
MessageDeduplicationId=dedupId,
)
else:
msg = self.sqs.send_message(
QueueUrl=self.queueURL,
MessageBody=message,
MessageGroupId=messageGroupId,
)
if msg is not None:
print(msg['MessageId'])
# inst = sendToSQS.createSession(
# profileName='plex-aws',
# queueURL='https://sqs.eu-west-1.amazonaws.com'
# '/745437999005/slack-bot.fifo',
# )
# instM = (
# getTrafficInfo.getTrafficURL('M62')
# .findIncidents()
# .getIncidentInformation()
# .generateOutput()
# )
# for _ in range(0, 5):
# for item in instM.output:
# inst.sendMessage(message=item, messageGroupId='slack-bot-M62')