From 60c9e26eb8ffa05daeb2f49d36c5014c0ef7ed45 Mon Sep 17 00:00:00 2001
From: Pranav Anbarasu <pranavanba@gmail.com>
Date: Thu, 29 Feb 2024 17:21:49 +0000
Subject: [PATCH] Update max_rows_per_file param in arrow::write_dataset()
 operation

---
 filtering.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/filtering.R b/filtering.R
index 306a8a5..760dbfe 100644
--- a/filtering.R
+++ b/filtering.R
@@ -48,7 +48,7 @@ drop_cols_datasets <- function(dataset, columns=c(), input = AWS_PARQUET_DOWNLOA
     arrow::open_dataset(sources = input_path) %>% 
       dplyr::select(!dplyr::any_of(columns)) %>% 
       arrow::write_dataset(path = final_path, 
-                           max_rows_per_file = 100000,
+                           max_rows_per_file = 1000000,
                            partitioning = partitions, 
                            existing_data_behavior = 'delete_matching',
                            basename_template = paste0("part-0000{i}.", as.character("parquet")))