Commit
Merge pull request #9 from pranavanba/main
Fix filtering function calls and introduce minor updates
pranavanba authored Nov 8, 2023
2 parents a00f950 + 1b2c2db commit 945ad1e
Showing 3 changed files with 16 additions and 7 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -38,3 +38,4 @@ dictionaries
dev*
misc*
*temp*
+*pilot*
14 changes: 11 additions & 3 deletions filtering.R
@@ -50,13 +50,17 @@ drop_cols_datasets <- function(dataset, columns=c(), input = AWS_PARQUET_DOWNLOAD_LOCATION
arrow::write_dataset(path = final_path,
max_rows_per_file = 100000,
partitioning = partitions,
-                       existing_data_behavior = 'delete_matching')
+                       existing_data_behavior = 'delete_matching',
+                       basename_template = paste0("part-0000{i}.", as.character("parquet")))
}
}


# Filtering ---------------------------------------------------------------
dob2age("dataset_enrolledparticipants", "DateOfBirth")
dob2age(dataset = "dataset_enrolledparticipants",
column = "DateOfBirth",
input = AWS_PARQUET_DOWNLOAD_LOCATION,
partitions = "cohort")

unlink(PARQUET_FILTERED_LOCATION, recursive = T, force = T)
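Note on the hunk above: in the R arrow package, basename_template controls how the written part files are named, with "{i}" replaced by an auto-incremented file index; the dob2age() call likewise now passes input and partitions explicitly instead of relying on defaults. A minimal, self-contained sketch of the basename_template behavior, using toy data and a throwaway path rather than this repo's locations:

    library(arrow)

    # Toy data and a throwaway output path -- not this repo's datasets or locations.
    # "{i}" in basename_template is replaced by an auto-incremented file index, so
    # the written files are named part-00000.parquet, part-00001.parquet, and so on.
    write_dataset(mtcars,
                  path = "demo_parquet",
                  partitioning = "cyl",
                  max_rows_per_file = 10,
                  existing_data_behavior = "delete_matching",
                  basename_template = "part-0000{i}.parquet")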

@@ -67,7 +71,11 @@ pii_to_drop <- synGet('syn52523394')$path %>% read.csv()
tmp <-
lapply(seq_len(nrow(pii_to_drop)), function(i) {
cat(i, "Dropping", pii_to_drop$column_to_be_dropped[[i]], "from", pii_to_drop$dataset[[i]], "\n")
-    drop_cols_datasets(dataset = pii_to_drop$dataset[[i]], columns = pii_to_drop$column_to_be_dropped[[i]])
+    drop_cols_datasets(dataset = pii_to_drop$dataset[[i]],
+                       columns = pii_to_drop$column_to_be_dropped[[i]],
+                       input = AWS_PARQUET_DOWNLOAD_LOCATION,
+                       output = PARQUET_FILTERED_LOCATION,
+                       partitions = "cohort")
})

rm(pii_to_drop)
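Note on the loop above: the calls now pass input, output, and partitions explicitly rather than relying on the defaults baked into drop_cols_datasets(), whose body is not part of this diff. As a rough, hypothetical sketch of the shape such a column-dropping step usually takes with arrow and dplyr (placeholder names only, not this repo's implementation):

    library(arrow)
    library(dplyr)

    # Hypothetical sketch only -- the real drop_cols_datasets() is defined elsewhere
    # in this repo and may differ. Read one parquet dataset, drop the named columns,
    # and rewrite it partitioned by the given variable(s).
    drop_cols_sketch <- function(dataset, columns, input, output, partitions) {
      open_dataset(file.path(input, dataset)) %>%
        select(-any_of(columns)) %>%
        write_dataset(path = file.path(output, dataset),
                      partitioning = partitions,
                      existing_data_behavior = "delete_matching")
    }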
8 changes: 4 additions & 4 deletions sts_synindex_external.R
@@ -170,14 +170,14 @@ system(manifest_cmd)

# Index files in Synapse --------------------------------------------------
# Get a list of all files to upload and their synapse locations (parentId)
-STR_LEN_PARQUET_FINAL_LOCATION <- stringr::str_length(PARQUET_FINAL_LOCATION)
+STR_LEN_PARQUET_FINAL_LOCATION <- stringr::str_length(AWS_ARCHIVE_DOWNLOAD_LOCATION)

## List all local files present (from manifest)
synapse_manifest <-
read.csv('./current_manifest.tsv', sep = '\t', stringsAsFactors = F) %>%
dplyr::filter(!grepl('owner.txt', path)) %>%
dplyr::rowwise() %>%
-  dplyr::mutate(file_key = stringr::str_sub(string = path, start = STR_LEN_PARQUET_FINAL_LOCATION)) %>%
+  dplyr::mutate(file_key = stringr::str_sub(string = path, start = STR_LEN_PARQUET_FINAL_LOCATION+2)) %>%
dplyr::mutate(s3_file_key = paste0(PARQUET_BUCKET_BASE_KEY_ARCHIVE, file_key)) %>%
dplyr::mutate(md5_hash = as.character(tools::md5sum(path))) %>%
dplyr::ungroup()
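Note on the str_sub() change above: str_sub() is 1-indexed, so with a prefix of length N, position N+1 is the "/" separator and N+2 is the first character of the relative file key; starting at N alone would leave the last prefix character and the slash in file_key. A small illustration with made-up paths, not the repo's actual locations:

    library(stringr)

    # Illustrative paths only.
    prefix <- "./archive"                                   # str_length(prefix) is 9
    path   <- "./archive/cohort=adults/part-00000.parquet"
    str_sub(string = path, start = str_length(prefix) + 2)
    #> [1] "cohort=adults/part-00000.parquet"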
@@ -232,8 +232,8 @@ if(nrow(synapse_manifest_to_upload) > 0){
name = new_fileName)

f <- synStore(f,
activity = "Indexing",
activityDescription = "Indexing external parquet datasets",
activityName = "Indexing",
activityDescription = "Indexing external parquet datasets",
used = PARQUET_FOLDER_INTERNAL,
executed = latest_commit_tree_url)
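Note on the synStore() change above: in synapser, the activity argument expects an Activity object, whereas activityName and activityDescription take plain strings and let synStore() build the provenance record itself, which is why the string "Indexing" now goes to activityName. A hedged sketch of the two equivalent call shapes, with placeholder synIDs, paths, and URLs:

    library(synapser)

    # Placeholders throughout; synLogin() is assumed to have been called already.
    f <- File("path/to/part-00000.parquet", parent = "syn00000000")

    # String form (what this diff switches to): synStore() creates the Activity.
    f <- synStore(f,
                  activityName = "Indexing",
                  activityDescription = "Indexing external parquet datasets",
                  used = "syn11111111",
                  executed = "https://github.com/example/repo/tree/abc1234")

    # Equivalent explicit form: build an Activity object and pass it via activity.
    act <- Activity(name = "Indexing",
                    description = "Indexing external parquet datasets",
                    used = "syn11111111",
                    executed = "https://github.com/example/repo/tree/abc1234")
    f <- synStore(f, activity = act)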

