forked from rh-aiservices-bu/object-detection-rest
-
Notifications
You must be signed in to change notification settings - Fork 6
/
data_ingestion.py
57 lines (41 loc) · 1.71 KB
/
data_ingestion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from os import environ, listdir, makedirs, path, unlink
from shutil import rmtree

from boto3 import client
def ingest_data(bucket_name='', data_folder='./data'):
    """Download every ``.jpg`` object from an S3 bucket into ``data_folder``.

    The target folder is created when missing and emptied before the
    download starts. The endpoint and credentials are read from the
    ``AWS_S3_ENDPOINT``, ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY``
    environment variables.

    Each object is saved under the last ``/``-separated segment of its key,
    so objects that share a basename under different prefixes overwrite one
    another locally.

    Args:
        bucket_name: Bucket to download from. When empty (the default), the
            ``AWS_S3_BUCKET`` environment variable is used instead.
        data_folder: Local directory that receives the files. Anything
            already inside it is deleted first.
    """
    # _clean_folder lists the directory, which fails if it does not exist yet.
    makedirs(data_folder, exist_ok=True)
    _clean_folder(data_folder)

    print('Commencing data ingestion.')

    s3_endpoint_url = environ.get('AWS_S3_ENDPOINT')
    s3_access_key = environ.get('AWS_ACCESS_KEY_ID')
    s3_secret_key = environ.get('AWS_SECRET_ACCESS_KEY')
    # An explicit argument wins; the environment variable is the fallback.
    s3_bucket_name = bucket_name or environ.get('AWS_S3_BUCKET')

    print(f'Downloading data from bucket "{s3_bucket_name}" '
          f'from S3 storage at {s3_endpoint_url}')

    s3_client = client(
        's3', endpoint_url=s3_endpoint_url,
        aws_access_key_id=s3_access_key, aws_secret_access_key=s3_secret_key
    )

    # Paginate so that buckets larger than one listing page are fully covered.
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=s3_bucket_name)
    for page in pages:
        # A page without a 'Contents' entry contributes no objects.
        for obj in page.get('Contents', []):
            key = obj['Key']
            if not key.endswith('.jpg'):
                continue
            # Flatten the key: only its final path segment names the file.
            local_file_path = path.join(data_folder, key.split('/')[-1])
            print(f'Downloading {key} to {local_file_path}')
            s3_client.download_file(s3_bucket_name, key, local_file_path)

    print('Finished data ingestion.')
def _clean_folder(folder):
    """Remove everything directly inside ``folder``, keeping the folder itself.

    Regular files and symbolic links are unlinked; directories are removed
    recursively. A failure on one entry is printed and does not stop the
    remaining entries from being processed.

    Args:
        folder: Path of the directory to empty.
    """
    print(f'Cleaning folder {folder}')
    for entry in listdir(folder):
        target = path.join(folder, entry)
        try:
            # Links are unlinked rather than followed, even when they point
            # at a directory.
            if path.isfile(target) or path.islink(target):
                unlink(target)
                continue
            if path.isdir(target):
                rmtree(target)
        except Exception as err:
            # Best effort: report the problem and move on to the next entry.
            print(f'Failed to delete {target}. Reason: {err}')
if __name__ == '__main__':
    # Script entry point: ingest into the absolute /data directory rather
    # than the function's relative './data' default.
    ingest_data(data_folder='/data')