diff --git a/src/lambda_function/raw_sync/app.py b/src/lambda_function/raw_sync/app.py
index aa01232..a8679e0 100644
--- a/src/lambda_function/raw_sync/app.py
+++ b/src/lambda_function/raw_sync/app.py
@@ -176,7 +176,7 @@ def match_corresponding_raw_object(
     data_type: str,
     cohort: str,
     expected_key: str,
-    raw_keys: list[dict],
+    raw_keys: defaultdict,
 ) -> Optional[str]:
     """
     Find a matching raw object for a given export file and filename.
@@ -483,6 +483,29 @@ def get_data_type_from_path(path: str) -> str:
     return data_type
 
 
+def get_expected_raw_key(namespace: str, data_type: str, cohort: str, path: str) -> str:
+    """Get the expected raw S3 key
+
+    Get the expected raw S3 key of a raw bucket object corresponding to the given
+    input bucket object.
+
+    Args:
+        namespace (str): The namespace of the corresponding input object.
+        data_type (str): The data type of the corresponding input object.
+        cohort (str): The cohort of the corresponding input object.
+        path (str): The path of the file relative to the zip archive (export).
+
+    Returns:
+        str: The expected S3 key of the corresponding raw object.
+    """
+    file_identifier = os.path.basename(path).split(".")[0]
+    expected_key = (
+        f"{namespace}/json/dataset={data_type}"
+        f"/cohort={cohort}/{file_identifier}.ndjson.gz"
+    )
+    return expected_key
+
+
 def main(
     event: dict,
     s3_client: boto3.client,
@@ -519,22 +542,23 @@ def main(
                 f"from s3://{input_bucket}/{export_key}"
             )
             data_type = get_data_type_from_path(path=filename)
-            file_identifier = filename.split(".")[0]
-            expected_key = (
-                f"{namespace}/json/dataset={data_type}"
-                f"/cohort={cohort}/{file_identifier}.ndjson.gz"
+            expected_raw_key = get_expected_raw_key(
+                namespace=namespace,
+                data_type=data_type,
+                cohort=cohort,
+                path=filename,
             )
             corresponding_raw_object = match_corresponding_raw_object(
                 data_type=data_type,
                 cohort=cohort,
-                expected_key=expected_key,
+                expected_key=expected_raw_key,
                 raw_keys=raw_keys,
             )
             if corresponding_raw_object is None:
                 logger.info(
                     f"Did not find corresponding raw object for {filename} from "
                     f"s3://{input_bucket}/{export_key} at "
-                    f"s3://{raw_bucket}/{expected_key}"
+                    f"s3://{raw_bucket}/{expected_raw_key}"
                 )
                 publish_to_sns(
                     bucket=input_bucket,
diff --git a/tests/test_lambda_raw_sync.py b/tests/test_lambda_raw_sync.py
index 58daa7d..960e62e 100644
--- a/tests/test_lambda_raw_sync.py
+++ b/tests/test_lambda_raw_sync.py
@@ -633,3 +633,30 @@ def test_get_data_type_from_path_deleted():
     path = "path/to/HealthKitV2Samples_AppleStandTime_Deleted_20241111-20241112.json"
     data_type = app.get_data_type_from_path(path=path)
     assert data_type == "HealthKitV2Samples_Deleted"
+
+
+def test_get_expected_raw_key_case1():
+    namespace = "test-namespace"
+    data_type = "test-data-type"
+    cohort = "test-cohort"
+    path = "path/to/FitbitIntradayCombined_20241111-20241112.json"
+    expected_key = f"{namespace}/json/dataset={data_type}/cohort={cohort}/FitbitIntradayCombined_20241111-20241112.ndjson.gz"
+    assert app.get_expected_raw_key(namespace, data_type, cohort, path) == expected_key
+
+
+def test_get_expected_raw_key_case2():
+    namespace = "test-namespace"
+    data_type = "test-data-type"
+    cohort = "test-cohort"
+    path = "path/to/HealthKitV2Samples_AppleStandTime_20241111-20241112.json"
+    expected_key = f"{namespace}/json/dataset={data_type}/cohort={cohort}/HealthKitV2Samples_AppleStandTime_20241111-20241112.ndjson.gz"
+    assert app.get_expected_raw_key(namespace, data_type, cohort, path) == expected_key
+
+
+def test_get_expected_raw_key_case3():
+    namespace = "test-namespace"
+    data_type = "test-data-type"
+    cohort = "test-cohort"
+    path = "path/to/HealthKitV2Samples_AppleStandTime_Deleted_20241111-20241112.json"
+    expected_key = f"{namespace}/json/dataset={data_type}/cohort={cohort}/HealthKitV2Samples_AppleStandTime_Deleted_20241111-20241112.ndjson.gz"
+    assert app.get_expected_raw_key(namespace, data_type, cohort, path) == expected_key