Skip to content

Commit

Permalink
(#102) Create script to migrate data to S3 bucket.
Browse files Browse the repository at this point in the history
Expects CSV file as exported by MS SQL Server Management Studio.
(See https://github.com/NCIOCPL/clinical-trials-search-print/wiki/Deployment#data-migration)

Closes #102
  • Loading branch information
blairlearn committed Nov 12, 2024
1 parent 63bde79 commit 830f0d0
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 0 deletions.
19 changes: 19 additions & 0 deletions migration/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Developer convenience targets for the migration script.
#
# None of these targets produce a file with the target's name, so they are
# declared phony; otherwise a file or directory called "test", "lint", etc.
# would make the target appear up to date and skip its recipe.
.PHONY: format lint test all

# Rewrite the sources in place using the project's import-sorting and
# formatting tools.
format:
	python -m isort .
	python -m black .

# Check-only counterpart of `format`, plus static analysis.
lint:
	python -m isort . -c
	python -m black . --check
	pylint . --recursive=y

# Run the unit tests under coverage and print a report for each suite.
# NOTE(review): py_acli and automations/helpers are not among the files added
# by this commit - confirm these discovery paths are correct for this project.
# NOTE(review): `coverage` is not listed in requirements.txt - confirm it is
# installed in the development environment.
test:
	coverage run -m unittest discover -s py_acli
	coverage report
	coverage run -m unittest discover -s automations/helpers
	coverage report

# Run everything in a fixed order. $(MAKE) is used rather than a literal
# `make` so that command-line flags and the jobserver are passed on to the
# sub-makes and the same make binary is reused.
all:
	$(MAKE) format
	$(MAKE) lint
	$(MAKE) test
94 changes: 94 additions & 0 deletions migration/migrate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""
Script to store clinical trial print pages in an S3 bucket.

The datafile (supplied as the only command-line parameter) is made up of
multiple records, one record per line. Fields in the individual records
which may contain commas are surrounded by double-quotes, with any double-
quotes in the field's value replaced by a pair of double-quotes.
(This is really just the output of the MS SQL Server Management Studio's
"Save as CSV" on a query.)

Fields in the record are:
    key          - Unique string (a GUID) for identifying the record.
    cacheDate    - Timestamp identifying when the record was originally
                   generated.
    trialIDs     - Comma-separated list of trial IDs.
    searchParams - A string containing the JSON search criteria.
    content      - A blob of HTML representing the actual page.

The destination bucket is read from the CTS_BUCKET_NAME environment variable.
"""

import csv
import os
import sys

import boto3
from botocore.exceptions import ClientError

# Name of the environment variable holding the destination bucket.
BUCKET_ENV_VAR = "CTS_BUCKET_NAME"

# Number of leading fields read from each record (see the module docstring).
EXPECTED_FIELD_COUNT = 5


def _get_bucket_name():
    """Return the destination bucket name taken from the environment.

    Raises:
        RuntimeError: the variable is missing or contains only whitespace.
    """
    bucket = os.environ.get(BUCKET_ENV_VAR, "").strip()
    if not bucket:
        raise RuntimeError(
            "The 'CTS_BUCKET_NAME' environment variable has not been set."
        )
    return bucket


def _raise_csv_field_limit():
    """Raise the csv module's per-field size limit as far as it will go.

    The content field can be far larger than the csv module's default limit.
    csv.field_size_limit() raises OverflowError when the requested value does
    not fit in a C long (sys.maxsize does not on some platforms), so halve the
    request until it is accepted.
    """
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            return
        except OverflowError:
            limit //= 2


def _upload_record(s3_client, bucket, row):
    """Store one record in the bucket.

    Args:
        s3_client: boto3 S3 client used for the upload.
        bucket: name of the destination bucket.
        row: list of at least EXPECTED_FIELD_COUNT strings, ordered as
            described in the module docstring; extra fields are ignored.
    """
    key, cache_date, trial_ids, search_params, content = row[:EXPECTED_FIELD_COUNT]

    metadata = {
        "migrated-data": "True",
        "originally-generated": cache_date,
        "search-criteria": search_params,
        "trial-id-list": trial_ids,
    }

    s3_client.put_object(
        Key=key,
        Bucket=bucket,
        Metadata=metadata,
        Body=content.encode("utf-8"),
        ContentType="text/html",
    )


def main():
    """Upload every record of the datafile named on the command line.

    A record which fails to upload is reported and counted, and processing
    continues with the next record. An expired AWS token is the exception:
    every later upload would fail as well, so the run is aborted.

    Raises:
        RuntimeError: wrong number of command-line arguments, bucket name
            not configured, or the AWS token expired during the run.
    """
    # Validate all of the inputs before creating the AWS client.
    if len(sys.argv) != 2:
        raise RuntimeError(
            "Please provide the input file name as a command line argument."
        )
    bucket = _get_bucket_name()
    s3_client = boto3.client("s3")

    _raise_csv_field_limit()

    loaded_count = 0
    error_count = 0
    total_count = 0

    # "utf-8-sig" discards the byte-order mark if the export starts with one.
    # newline="" is what the csv module documentation requires so that line
    # breaks embedded in quoted fields (e.g. the HTML content) reach the
    # reader untranslated.
    with open(sys.argv[1], encoding="utf-8-sig", newline="") as datafile:
        reader = csv.reader(datafile)

        for row in reader:
            # The reader yields an empty list for a blank line (for example
            # a trailing newline at the end of the export); not a record.
            if not row:
                continue

            total_count += 1

            # Report a truncated record instead of letting an IndexError
            # abort the remainder of the migration.
            if len(row) < EXPECTED_FIELD_COUNT:
                error_count += 1
                print(
                    f"Line {reader.line_num}: expected {EXPECTED_FIELD_COUNT} "
                    f"fields but found {len(row)}; record skipped."
                )
                continue

            print(row[0])
            try:
                _upload_record(s3_client, bucket, row)

            # Handle AWS-related errors.
            except ClientError as err:
                error_count += 1
                print(err)

                # Bail completely for expired token.
                if err.response["Error"]["Code"] == "ExpiredToken":
                    raise RuntimeError(
                        "\n\n\n\tFatal error - Expired token.\n\n"
                    ) from err

            # Non-AWS errors: deliberately broad so that a single bad record
            # cannot stop the rest of the migration.
            except Exception as err:  # pylint: disable=broad-exception-caught
                error_count += 1
                print(err)

            else:
                loaded_count += 1

    print(
        f"Processed {total_count} documents: "
        f"Loaded {loaded_count} with {error_count} errors."
    )


if __name__ == "__main__":
    main()
7 changes: 7 additions & 0 deletions migration/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Production
boto3~=1.35.10

# Development
black==24.8.0
isort==5.13.2
pylint==3.2.7
7 changes: 7 additions & 0 deletions migration/sample-data/cts-print-export.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions migration/sample-data/embedded-commas.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
AB561B48-B860-EF11-A20E-005056B388D3,2024-08-22 14:56:48.177,"NCI-2019-04727,NCI-2019-01528,NCI-2017-00902,NCI-2021-07529,NCI-2015-01548,NCI-2016-01047,NCI-2021-03303,NCI-2018-01807,NCI-2020-11651,NCI-2020-07169","[{""Label"":""Trial Type"",""Value"":""Supportive Care, Health Services Research, Basic Science""}]"," HTML would go here."

0 comments on commit 830f0d0

Please sign in to comment.