From 60c9e26eb8ffa05daeb2f49d36c5014c0ef7ed45 Mon Sep 17 00:00:00 2001 From: Pranav Anbarasu Date: Thu, 29 Feb 2024 17:21:49 +0000 Subject: [PATCH] Update max_rows_per_file param in arrow::write_dataset() operation --- filtering.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filtering.R b/filtering.R index 306a8a5..760dbfe 100644 --- a/filtering.R +++ b/filtering.R @@ -48,7 +48,7 @@ drop_cols_datasets <- function(dataset, columns=c(), input = AWS_PARQUET_DOWNLOA arrow::open_dataset(sources = input_path) %>% dplyr::select(!dplyr::any_of(columns)) %>% arrow::write_dataset(path = final_path, - max_rows_per_file = 100000, + max_rows_per_file = 1000000, partitioning = partitions, existing_data_behavior = 'delete_matching', basename_template = paste0("part-0000{i}.", as.character("parquet")))