Skip to content

Commit

Permalink
Merge pull request #18 from pranavanba/main
Browse files: browse the repository at this point in the history
Handle unknown formatting of dictionary file values in deidentification step
  • Loading branch information
pranavanba authored Feb 1, 2024
2 parents 776be48 + 560f318 commit b35e9eb
Showing 1 changed file with 7 additions and 4 deletions.
11 changes: 7 additions & 4 deletions deidentification.R
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,8 +14,10 @@ junk <- lapply(list.files("./dictionaries/", full.names = T), function(f) {
lines <- readLines(f)

modified_lines <- lapply(lines, function(line) {
if (!grepl("^\".*\",", line)) {
line <- gsub("^(.*),", '"\\1",', line)
line <- gsub('"', '', line)
if (grepl(",APPROVED|,UNAPPROVED", line)) {
# line <- gsub("^(.*?)(,APPROVED|,approved|,UNAPPROVED|,unapproved)", '"\\1"\\2', line)
line <- gsub('(.*?)"?(,APPROVED|,approved|,UNAPPROVED|,unapproved)', '"\\1"\\2', line)
}
return(line)
})
Expand Down Expand Up @@ -101,9 +103,10 @@ for (i in seq_along(deidentified_results$deidentified_datasets)) {

arrow::write_dataset(dataset = deidentified_results$deidentified_datasets[[i]],
path = file.path(PARQUET_FINAL_LOCATION, names(deidentified_results$deidentified_datasets)[[i]]),
max_rows_per_file = 100000,
max_rows_per_file = 1000000,
partitioning = c('cohort'),
existing_data_behavior = 'delete_matching')
existing_data_behavior = 'delete_matching',
basename_template = paste0("part-0000{i}.", as.character("parquet")))
}


Expand Down

0 comments on commit b35e9eb

Please sign in to comment.