Skip to content

Commit

Permalink
(#112) Save search metadata as a separate document.
Browse files Browse the repository at this point in the history
Closes #112
  • Loading branch information
blairlearn committed Jan 28, 2025
1 parent 5621eaa commit 9805285
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 49 deletions.
20 changes: 18 additions & 2 deletions migration/migrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,12 @@

BUCKET = os.environ["CTS_BUCKET_NAME"]

# The linter is being stupid. These are running totals, not constants.
# pylint: disable=invalid-name
loadedCount = 0
errorCount = 0
totalCount = 0
# pylint: enable=invalid-name
with open(sys.argv[1], encoding="utf-8-sig") as datafile:
csv.field_size_limit(sys.maxsize)
reader = csv.reader(datafile)
Expand All @@ -55,11 +58,15 @@
searchParams = row[3]
content = row[4]

metadataKey = f"{key}-metadata"

# Convert the comma-separated trial IDs into the string representation of a JSON array.
trialList = '["' + '","'.join(trialIDs.split(',')) + '"]'
searchMetadata = f"{{\"trial_ids\": {trialList}, \"search_criteria\": {searchParams} }}"

metadata = {}
metadata["migrated-data"] = "True"
metadata["originally-generated"] = cacheDate
metadata["search-criteria"] = searchParams
metadata["trial-id-list"] = trialIDs

print(key)
try:
Expand All @@ -70,6 +77,13 @@
Body=bytearray(content, "utf-8"),
ContentType="text/html",
)
S3_CLIENT.put_object(
Key=metadataKey,
Bucket=BUCKET,
Metadata=metadata,
Body=bytearray(searchMetadata, "utf-8"),
ContentType="application/json",
)
loadedCount += 1

# Handle AWS-related errors.
Expand All @@ -82,9 +96,11 @@
raise RuntimeError("\n\n\n\tFatal error - Expired token.\n\n") from err

# Non-AWS errors
# pylint: disable=broad-exception-caught
except Exception as err:
errorCount += 1
print(err)
# pylint: enable=broad-exception-caught

finally:
totalCount += 1
Expand Down
1 change: 1 addition & 0 deletions migration/sample-data/large-metadata.csv

Large diffs are not rendered by default.

59 changes: 41 additions & 18 deletions src/CancerGov.CTS.Print/DataManager/PrintCacheManager.cs
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Threading.Tasks;

using Amazon.S3;
using Amazon.S3.Model;
using Common.Logging;

using CancerGov.CTS.Print.Models;
using System.IO;

namespace CancerGov.CTS.Print.DataManager
{
public class PrintCacheManager
Expand Down Expand Up @@ -37,16 +37,15 @@ public PrintCacheManager(IAmazonS3 client, string bucketName)
/// Saves the generated page and its associated metadata in the S3 bucket associated
/// with this instance
/// </summary>
/// <param name="key"></param>
/// <param name="trialIds"></param>
/// <param name="searchParams"></param>
/// <param name="content"></param>
public async Task Save(Guid key, string[] trialIds, SearchCriteria searchParams, string content)
/// <param name="key">A unique identifier for later retrieval of the document.</param>
/// <param name="metadata">JSON string containing a list of trial IDs and the search criteria.</param>
/// <param name="content">The document body.</param>
public async Task Save(Guid key, string metadata, string content)
{
try
{
// Create a PutObject request
PutObjectRequest request = new PutObjectRequest
PutObjectRequest documentPutRequest = new PutObjectRequest
{
BucketName = BucketName,
Key = key.ToString(),
Expand All @@ -55,24 +54,48 @@ public async Task Save(Guid key, string[] trialIds, SearchCriteria searchParams,
};

// Metadata in case we ever need to search for print pages meeting some criteria.
request.Metadata.Add(METADATA_TRIAL_ID_LIST_KEY, String.Join(",", trialIds));
request.Metadata.Add(METADATA_SEARCH_CRITERIA_KEY, searchParams.ToJson());
// Can't save this as object metadata because AWS limits user metadata to 2K and
// a sufficiently complicated advanced search can exceed that.
// https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html
string metadataKey = $"{key.ToString()}-metadata";
PutObjectRequest metadataPutRequest = new PutObjectRequest
{
BucketName = BucketName,
Key = metadataKey,
ContentBody = metadata,
ContentType = "application/json"
};

// Put object.
PutObjectResponse response = await this.Client.PutObjectAsync(request);
// Do both PUT requests together to save time.
// In theory, one of these requests could succeed and the other fail and throw an error,
// but for now we'll assume either both succeed or both throw.
PutObjectRequest[] requests = { documentPutRequest, metadataPutRequest };
PutObjectResponse[] responses = await Task.WhenAll(
requests.Select(async individual =>
await this.Client.PutObjectAsync(individual)
)
);

PutObjectResponse documentResponse = responses[0];
PutObjectResponse metadataResponse = responses[1];

// Log "unusual" status codes, but attempt to keep going.
int status = (int)response.HttpStatusCode;
if(!(status >= 200 && status < 300))
int documentStatus = (int)documentResponse.HttpStatusCode;
int metadataStatus = (int)metadataResponse.HttpStatusCode;
if(!(documentStatus >= 200 && documentStatus < 300 && metadataStatus >= 200 && metadataStatus < 300))
{
log.Warn($"Unexpected return code '{status}' ({response.HttpStatusCode}) saving document '{key}'.");
if (documentStatus >= 300 )
log.Warn($"Unexpected return code '{documentStatus}' ({documentResponse.HttpStatusCode}) saving document '{key}'.");
if (metadataStatus >= 300)
log.Warn($"Unexpected return code '{metadataStatus}' ({metadataResponse.HttpStatusCode}) saving document '{key}' metadata.");


// Something's definitely wrong with this save attempt.
// Log an error and throw.
// See "List of error codes" on https://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html#ErrorCodeList
if (status >= 400)
if (documentStatus >= 400)
{
string message = $"Error return code '{status}' ({response.HttpStatusCode}) saving document '{key}'.";
string message = $"Error return code '{documentStatus}' ({documentResponse.HttpStatusCode}) saving document '{key}'.";
log.Error(message);
throw new PrintSaveFailureException(message);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -308,9 +308,12 @@ public async override Task ProcessRequestAsync(HttpContext context)
string referenceURL = String.Format(PrintPageDisplayURLFormat, key);
page = page.Replace("${generatePrintURL}", referenceURL);

// Save the results.
// Use the entire request body as metadata.
string rawRequestData = requestBody.ToString(Formatting.None);

// Save the page.
var datamgr = GetPrintCacheManager();
await datamgr.Save(key, trialIDs, criteria, page);
await datamgr.Save(key, rawRequestData, page);

// Set up the return.
statusCode = 200;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
using System;
using System.Collections.Generic;
using System.Net;
using System.Threading;
using System.Threading.Tasks;

using Amazon.S3;
using Amazon.S3.Model;
using CancerGov.CTS.Print.DataManager;
using CancerGov.CTS.Print.Models;
using Moq;
using Xunit;

Expand Down Expand Up @@ -34,15 +34,16 @@ public async Task SaveFailure(HttpStatusCode statusCode)

const string key = "ca761232-ed42-11ce-bacd-00aa0057b223";

SearchCriteria criteria = SearchCriteriaFactory.Create(null);
// Exact contents don't matter, we just need something.
string mockMetadata = "{\"trial_ids\": [ \"trial-id-1\", \"trial-id-2\"],\"search_criteria\": null}";

var cm = new PrintCacheManager(mockClientConfig.Object, "fake-bucket-name");

PrintSaveFailureException ex = await Assert.ThrowsAsync<PrintSaveFailureException>
(
async () =>
{
await cm.Save(new Guid(key), new string[] { "trial-id-1", "trial-id-2" }, criteria, "blob of HTML");
await cm.Save(new Guid(key), mockMetadata, "blob of HTML");
}
);
Assert.Equal(
Expand All @@ -64,44 +65,50 @@ public async Task ObjectStructure(HttpStatusCode status)
{
Mock<IAmazonS3> mockClientConfig = new Mock<IAmazonS3>();

PutObjectRequest actualRequest = null;

PutObjectResponse mockResponse = new PutObjectResponse
{
HttpStatusCode = status
};

const string expectedKey = "ca761232-ed42-11ce-bacd-00aa0057b223";
const string mockBucketName = "fake-bucket-name";

const string contentKey = "ca761232-ed42-11ce-bacd-00aa0057b223";
string metadataKey = $"{contentKey}-metadata"; // Can't put this as a const in C# 7.3.

// The exact strings don't matter, we just need values.
const string expectedMetadata = "\"trial_ids\": [\"trial-id-1\",\"trial-id-2\"], \"search_criteria\":{\"trialTypes\":[{\"label\":\"Supportive Care\",\"value\":\"supportive_care\",\"checked\":true}],\"location\": \"search-location-country\",\"country\": \"Canada\"}";
const string expectedBody = "<html><body><p>Blob of HTML</p></body></html>";

// Need to accommodate AWS prepending "x-amz-meta-" to the metadata field names.
string trial_id_key = String.Format("x-amz-meta-{0}", PrintCacheManager.METADATA_TRIAL_ID_LIST_KEY);
string search_criteria_key = String.Format("x-amz-meta-{0}", PrintCacheManager.METADATA_SEARCH_CRITERIA_KEY);
const string expectedTrialList = "trial-id-1,trial-id-2";
const string expectedSearchCriteria = "[{\"Label\":\"Country\",\"Value\":\"Canada\"},{\"Label\":\"Supportive Care\",\"Value\":\"supportive_care\"}]";

mockClientConfig.Setup(svc => svc.PutObjectAsync(It.IsAny<PutObjectRequest>(), It.IsAny<CancellationToken>()))
.Callback((PutObjectRequest request, CancellationToken tok) => actualRequest = request)
.ReturnsAsync(mockResponse);

SearchCriteria criteria = SearchCriteriaFactory.Create(null);
criteria.Add("Country", "Canada");
criteria.Add("Supportive Care", "supportive_care");

var cm = new PrintCacheManager(mockClientConfig.Object, "fake-bucket-name");

await cm.Save(new Guid(expectedKey), new string[] { "trial-id-1", "trial-id-2" }, criteria, expectedBody);
var cm = new PrintCacheManager(mockClientConfig.Object, mockBucketName);

Assert.NotNull(actualRequest);
Assert.Equal(expectedKey, actualRequest.Key);
Assert.Equal(expectedBody, actualRequest.ContentBody);
await cm.Save(new Guid(contentKey), expectedMetadata, expectedBody);

Assert.NotEmpty(actualRequest.Metadata.Keys);
Assert.Contains(trial_id_key, actualRequest.Metadata.Keys);
Assert.Contains(search_criteria_key, actualRequest.Metadata.Keys);
// Verify a request was made to save the metadata.
mockClientConfig.Verify(svc => svc.PutObjectAsync(
It.Is<PutObjectRequest>(req =>
req.BucketName == mockBucketName
&& req.Key == metadataKey
&& req.ContentBody == expectedMetadata
&& req.ContentType == "application/json"
),
It.IsAny<CancellationToken>()
), Times.Once()
);

Assert.Equal(expectedTrialList, actualRequest.Metadata[trial_id_key]);
Assert.Equal(expectedSearchCriteria, actualRequest.Metadata[search_criteria_key]);
mockClientConfig.Verify(svc => svc.PutObjectAsync(
It.Is<PutObjectRequest>(req =>
req.BucketName == mockBucketName
&& req.Key == contentKey
&& req.ContentBody == expectedBody
&& req.ContentType == "text/html"
),
It.IsAny<CancellationToken>()
), Times.Once()
);

}
}
Expand Down

0 comments on commit 9805285

Please sign in to comment.