From 6a819138d9c86dbc970c6802d2083827af19f910 Mon Sep 17 00:00:00 2001 From: Pranav Anbarasu Date: Tue, 24 Oct 2023 20:57:24 +0000 Subject: [PATCH 1/3] Add provenance arguments to synStore() --- sts_synindex_external.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sts_synindex_external.R b/sts_synindex_external.R index b8f1e80..6a956cb 100644 --- a/sts_synindex_external.R +++ b/sts_synindex_external.R @@ -214,21 +214,21 @@ if(nrow(synapse_manifest_to_upload) > 0){ tmp <- synapse_manifest_to_upload[file_number, c("path", "parent", "s3_file_key")] absolute_file_path <- tools::file_path_as_absolute(tmp$path) - + temp_syn_obj <- synapser::synCreateExternalS3FileHandle( bucket_name = PARQUET_BUCKET_EXTERNAL, s3_file_key = tmp$s3_file_key, file_path = absolute_file_path, parent = tmp$parent) - + new_fileName <- stringr::str_replace_all(temp_syn_obj$fileName, ':', '_colon_') - - f <- - synStore( - File(dataFileHandleId = temp_syn_obj$id, - parentId = tmp$parent, - name = new_fileName)) - + + f <- File(dataFileHandleId = temp_syn_obj$id, + parentId = tmp$parent, + name = new_fileName) + + f <- synStore(f, activity = "Indexing", activityDescription = "Indexing external parquet datasets", used = PARQUET_FOLDER_INTERNAL, executed = "") + } } From 1de7e95c738f14faa35fc2a9c37bda19c12c6ef0 Mon Sep 17 00:00:00 2001 From: Pranav Anbarasu Date: Tue, 24 Oct 2023 21:07:11 +0000 Subject: [PATCH 2/3] Use url of repo at latest commit as "executed" argument in synStore() for provenance --- sts_synindex_external.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sts_synindex_external.R b/sts_synindex_external.R index 6a956cb..631271a 100644 --- a/sts_synindex_external.R +++ b/sts_synindex_external.R @@ -209,6 +209,9 @@ synapse_manifest_to_upload <- s3_file_key = gsub("cohort_", "cohort=", s3_file_key)) # Index each file in Synapse +latest_commit <- gh::gh("/repos/:owner/:repo/commits/main", owner = "Sage-Bionetworks", repo = 
"recover-parquet-external") +latest_commit_tree_url <- latest_commit$html_url %>% stringr::str_replace("commit", "tree") + if(nrow(synapse_manifest_to_upload) > 0){ for(file_number in seq_len(nrow(synapse_manifest_to_upload))){ tmp <- synapse_manifest_to_upload[file_number, c("path", "parent", "s3_file_key")] @@ -228,7 +231,7 @@ if(nrow(synapse_manifest_to_upload) > 0){ parentId = tmp$parent, name = new_fileName) - f <- synStore(f, activity = "Indexing", activityDescription = "Indexing external parquet datasets", used = PARQUET_FOLDER_INTERNAL, executed = "") + f <- synStore(f, activity = "Indexing", activityDescription = "Indexing external parquet datasets", used = PARQUET_FOLDER_INTERNAL, executed = latest_commit_tree_url) } } From f97f3792b1c6ebe08dcda8b552354b7d620082bc Mon Sep 17 00:00:00 2001 From: Pranav Anbarasu Date: Tue, 24 Oct 2023 21:08:28 +0000 Subject: [PATCH 3/3] Organize code --- sts_synindex_external.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sts_synindex_external.R b/sts_synindex_external.R index 631271a..a43b618 100644 --- a/sts_synindex_external.R +++ b/sts_synindex_external.R @@ -231,7 +231,11 @@ if(nrow(synapse_manifest_to_upload) > 0){ parentId = tmp$parent, name = new_fileName) - f <- synStore(f, activity = "Indexing", activityDescription = "Indexing external parquet datasets", used = PARQUET_FOLDER_INTERNAL, executed = latest_commit_tree_url) + f <- synStore(f, + activity = "Indexing", + activityDescription = "Indexing external parquet datasets", + used = PARQUET_FOLDER_INTERNAL, + executed = latest_commit_tree_url) } }