-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
(#102) Create script to migrate data to S3 bucket.
Expects CSV file as exported by MS SQL Server Management Studio. (See https://github.com/NCIOCPL/clinical-trials-search-print/wiki/Deployment#data-migration) Closes #102
- Loading branch information
1 parent
63bde79
commit 830f0d0
Showing
5 changed files
with
128 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Developer convenience targets: formatting, linting and unit tests.
#
# None of these targets produce a file of the same name, so they are declared
# phony; otherwise a file or directory called e.g. "test" would make the
# target look up to date and its recipe would be skipped.
.PHONY: format lint test all

# Rewrite the sources in place: sort imports, then apply black formatting.
format:
	python -m isort .
	python -m black .

# Check-only counterparts of `format`, plus static analysis with pylint.
lint:
	python -m isort . -c
	python -m black . --check
	pylint . --recursive=y

# Run the unit tests under coverage, one discovery root at a time, printing a
# coverage report after each run.
# NOTE(review): confirm that `py_acli` and `automations/helpers` exist in this
# repository and that `coverage` is installed; neither is visible from here.
test:
	coverage run -m unittest discover -s py_acli
	coverage report
	coverage run -m unittest discover -s automations/helpers
	coverage report

# Run everything in order. $(MAKE) is used instead of a literal `make` so the
# sub-invocations use the same make binary and inherit command-line flags.
all:
	$(MAKE) format
	$(MAKE) lint
	$(MAKE) test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
#!/usr/bin/env python3
"""
Script to store clinical trial print pages in an S3 bucket.

The datafile (supplied as the only command-line parameter) is made up of
multiple records, one record per line. Fields in the individual records
which may contain commas are surrounded by double-quotes, with any double-
quotes in the field's value replaced by a pair of double-quotes.
(This is really just the output of the MS SQL Server Management Studio's
"Save as CSV" on a query.)

Fields in the record are:
    key          - Unique string (a GUID) for identifying the record.
    cacheDate    - Timestamp identifying when the record was originally generated.
    trialIDs     - Comma-separated list of trial IDs.
    searchParams - A string containing the JSON search criteria.
    content      - A blob of HTML representing the actual page.

The target bucket is named by the CTS_BUCKET_NAME environment variable.
"""

import csv
import os
import sys

import boto3
from botocore.exceptions import ClientError

S3_CLIENT = boto3.client("s3")

# Number of leading fields every record must supply (see module docstring).
FIELD_COUNT = 5


def _raise_csv_field_limit():
    """Lift the csv module's per-field size cap as far as the platform allows.

    csv.field_size_limit() raises OverflowError when the value does not fit
    in a C long (sys.maxsize is too large where long is 32 bits), so halve
    the request until it is accepted.
    """
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            return
        except OverflowError:
            limit //= 2


def _get_bucket_name():
    """Return the CTS_BUCKET_NAME value.

    Raises:
        RuntimeError: the variable is unset, empty, or only whitespace.
    """
    bucket = os.environ.get("CTS_BUCKET_NAME", "")
    if not bucket.strip():
        raise RuntimeError("The 'CTS_BUCKET_NAME' environment variable has not been set.")
    return bucket


def _store_record(bucket, row):
    """Upload one record's HTML to `bucket`, keyed by the record's key.

    The remaining fields are attached to the object as user metadata.
    """
    key, cache_date, trial_ids, search_params, content = row[:FIELD_COUNT]

    metadata = {
        "migrated-data": "True",
        "originally-generated": cache_date,
        "search-criteria": search_params,
        "trial-id-list": trial_ids,
    }

    print(key)
    S3_CLIENT.put_object(
        Key=key,
        Bucket=bucket,
        Metadata=metadata,
        Body=content.encode("utf-8"),
        ContentType="text/html",
    )


def migrate(path, bucket):
    """Upload every record in the CSV file at `path` to `bucket`.

    A failure on a single record is reported and counted, and processing
    continues with the next record. The one exception is an expired AWS
    token, which can never succeed on later records and so aborts the run.

    Returns:
        A (total, loaded, errors) tuple of record counts.

    Raises:
        RuntimeError: AWS reported an expired token.
    """
    total = loaded = errors = 0
    _raise_csv_field_limit()

    # newline="" is required by the csv module so that line breaks embedded
    # in quoted fields (the HTML content) reach the parser untranslated.
    with open(path, encoding="utf-8-sig", newline="") as datafile:
        reader = csv.reader(datafile)

        for row in reader:
            total += 1

            # Blank lines parse as [] and truncated records as short lists;
            # report them instead of letting an IndexError end the whole run.
            if len(row) < FIELD_COUNT:
                errors += 1
                print(
                    f"Skipping malformed record ending at line {reader.line_num}: "
                    f"expected {FIELD_COUNT} fields, found {len(row)}."
                )
                continue

            try:
                _store_record(bucket, row)
                loaded += 1

            # Handle AWS-related errors.
            except ClientError as err:
                errors += 1
                print(err)

                # Bail completely for expired token.
                if err.response["Error"]["Code"] == "ExpiredToken":
                    raise RuntimeError("\n\n\n\tFatal error - Expired token.\n\n") from err

            # Non-AWS errors: deliberately best-effort, keep going.
            except Exception as err:  # pylint: disable=broad-exception-caught
                errors += 1
                print(err)

    return total, loaded, errors


def main():
    """Validate the command line and environment, then run the migration."""
    if len(sys.argv) != 2:
        raise RuntimeError("Please provide the input file name as a command line argument.")

    bucket = _get_bucket_name()
    total, loaded, errors = migrate(sys.argv[1], bucket)

    print(f"Processed {total} documents: Loaded {loaded} with {errors} errors.")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Production | ||
boto3~=1.35.10 | ||
|
||
# Development | ||
black==24.8.0 | ||
isort==5.13.2 | ||
pylint==3.2.7 |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
AB561B48-B860-EF11-A20E-005056B388D3,2024-08-22 14:56:48.177,"NCI-2019-04727,NCI-2019-01528,NCI-2017-00902,NCI-2021-07529,NCI-2015-01548,NCI-2016-01047,NCI-2021-03303,NCI-2018-01807,NCI-2020-11651,NCI-2020-07169","[{""Label"":""Trial Type"",""Value"":""Supportive Care, Health Services Research, Basic Science""}]"," HTML would go here." |