diff --git a/.nojekyll b/.nojekyll
index 4285c40..83cd982 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-0d0b08cd
\ No newline at end of file
+7c2b2d55
\ No newline at end of file
diff --git a/benchmarking.html b/benchmarking.html
index db4311a..3a436ab 100644
--- a/benchmarking.html
+++ b/benchmarking.html
@@ -562,21 +562,21 @@

Unit: milliseconds
                          expr      min       lq      mean   median       uq
-             robject_polars()  27.1281  30.1365  35.44288  32.5842  38.7489
-              robject_rbase() 216.0771 228.6539 237.63752 239.9291 248.2713
-              robject_dplyr()  32.2709  34.4209  46.88596  49.5437  58.3165
-                 robject_dt()  45.0771  46.3848  59.67866  57.2030  68.3662
-       robject_duckdb_dplyr() 376.9727 397.6940 403.66032 400.4190 411.9787
-         robject_duckdb_sql()  80.6400  86.6684  89.71994  90.3671  91.0371
- robject_duckdb_arrow_dplyr() 292.8251 305.6941 333.98948 319.2016 321.3135
+             robject_polars()  22.2125  24.2582  28.78548  24.4624  36.2487
+              robject_rbase() 220.6788 246.8106 247.88984 247.2107 257.8647
+              robject_dplyr()  31.4568  32.3749  43.88756  45.0114  54.9415
+                 robject_dt()  44.0361  46.0946  57.57272  54.2305  67.5367
+       robject_duckdb_dplyr() 321.1113 322.1483 332.46270 326.6855 344.4590
+         robject_duckdb_sql()  75.9102  78.3098  83.63402  82.7341  89.0440
+ robject_duckdb_arrow_dplyr() 245.3665 248.7998 269.75346 254.2880 255.9525
       max neval
-  48.6167     5
- 255.2562     5
-  59.8778     5
-  81.3622     5
- 431.2372     5
-  99.8871     5
- 430.9131     5
+  36.7456     5
+ 266.8844     5
+  55.6532     5
+  75.9657     5
+ 347.9094     5
+  92.1720     5
+ 344.3605     5

πŸ‘‰ Conclusion of this little benchmark using R objects already loaded in memory: the fastest to run are polars and dplyr followed closely by data.table. πŸ†πŸ†πŸ†
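For orientation, a minimal sketch of how timings like these are typically produced with microbenchmark(). The data frame and the two wrapper functions below are illustrative stand-ins for the chapter's own definitions (only two of the seven contenders are shown):

library(polars)
library(dplyr)
library(microbenchmark)

# Hypothetical in-memory data, standing in for the chapter's dataset
mydf <- data.frame(
  colInt    = sample(1:10000, 1e6, replace = TRUE),
  colString = sample(c("A", "B", "C"), 1e6, replace = TRUE)
)

# One aggregation per framework, wrapped so microbenchmark() can time it
robject_polars <- function() {
  pl$DataFrame(mydf)$groupby("colString")$agg(pl$col("colInt")$mean())
}
robject_dplyr <- function() {
  mydf |> group_by(colString) |> summarise(mean_colInt = mean(colInt))
}

microbenchmark(robject_polars(), robject_dplyr(), times = 5)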
@@ -637,14 +637,14 @@

toc()
-0.29 sec elapsed
+0.263 sec elapsed
@@ -689,14 +689,14 @@

toc()
-0.098 sec elapsed
+0.084 sec elapsed
@@ -738,7 +738,7 @@

res_rbase <- csv_rbase()
toc()
-10.386 sec elapsed
+9.506 sec elapsed
print(res_rbase)
@@ -796,7 +796,7 @@

res_dplyr <- csv_dplyr()
toc()

-0.787 sec elapsed
+0.677 sec elapsed
print(res_dplyr)
@@ -854,7 +854,7 @@

res_arrow <- csv_arrow()
toc()

-0.419 sec elapsed
+0.365 sec elapsed
print(res_arrow)
@@ -904,7 +904,7 @@

toc()
-0.389 sec elapsed
+0.358 sec elapsed

@@ -941,19 +941,19 @@

Unit: milliseconds
                           expr       min        lq       mean    median
-  polars (eager) from csv file  279.7458  291.2039  300.19328  298.5861
-   polars (lazy) from csv file   82.5635   84.0201   89.73178   85.1958
-        R base - from csv file 8758.9725 8811.8265 9101.71930 8954.2198
-         dplyr - from csv file  607.1868  610.0531  656.87030  645.0672
- dplyr (Acero) - from csv file  244.1663  247.5951  256.30072  253.8460
-    data.table - from csv file  259.7881  263.5397  383.01002  335.6116
+  polars (eager) from csv file  259.0630  260.2271  273.28366  268.1899
+   polars (lazy) from csv file   79.8473   83.4839   87.61572   84.9361
+        R base - from csv file 7954.4349 8392.2098 8445.26252 8438.6395
+         dplyr - from csv file  523.7279  553.9853  594.50394  567.0172
+ dplyr (Acero) - from csv file  209.4744  210.2422  216.63242  218.0538
+    data.table - from csv file  262.3306  263.1075  351.57228  324.2653
         uq       max neval
-  308.3921  323.0385     5
-   88.3450  108.5345     5
- 9076.5466 9907.0311     5
-  707.6205  714.4239     5
-  266.2006  269.6956     5
-  431.9201  624.1906     5
+  284.1632  294.7751     5
+   86.7033  103.1080     5
+ 8601.4964 8839.5320     5
+  656.8072  670.9821     5
+  222.2487  223.1430     5
+  402.7321  505.4259     5

πŸ‘‰ Conclusion of this little benchmark based on csv files: the big winners are polars (eager mode) and dplyr with {arrow}. The results will undoubtedly be even better with polars (lazy mode)… πŸ†πŸ†πŸ†
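As a rough illustration of the eager vs lazy patterns being timed here (the csv path and the aggregation are placeholders, not the chapter's actual query), with tictoc producing the "sec elapsed" output:

library(polars)
library(tictoc)

# Eager: read_csv() parses the whole file into memory before the aggregation runs
tic()
res_eager <- pl$read_csv("Datasets/my_big_file.csv")$
  groupby("colString")$
  agg(pl$col("colInt")$mean())
toc()

# Lazy: scan_csv() only builds a query plan; collect() triggers the optimised execution
tic()
res_lazy <- pl$scan_csv("Datasets/my_big_file.csv")$
  groupby("colString")$
  agg(pl$col("colInt")$mean())$
  collect()
toc()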
@@ -1017,17 +1017,17 @@

parquet_polars_lazy()$collect()$to_data_frame()
  colString min_colInt mean_colInt max_colInt   min_colNum mean_colNum
-1         B       2001    5004.311       7999 3.385660e-05   0.5005457
-2         A       2001    4998.625       7999 3.794138e-05   0.4984446
-3         C       2001    5001.243       7999 3.045052e-05   0.5014723
+1         A       2001    4998.625       7999 3.794138e-05   0.4984446
+2         C       2001    5001.243       7999 3.045052e-05   0.5014723
+3         B       2001    5004.311       7999 3.385660e-05   0.5005457
   max_colNum
-1  0.9999863
-2  0.9999879
-3  0.9999921
+1  0.9999879
+2  0.9999921
+3  0.9999863
toc()
-0.046 sec elapsed
+0.043 sec elapsed
@@ -1072,7 +1072,7 @@

toc()
-0.142 sec elapsed
+0.131 sec elapsed
@@ -1117,7 +1117,7 @@

toc()
-0.208 sec elapsed
+0.156 sec elapsed
@@ -1163,7 +1163,7 @@

toc()
-0.095 sec elapsed
+0.09 sec elapsed
@@ -1182,16 +1182,16 @@

print(unique_parquet_bmk)
Unit: milliseconds
-                                      expr      min       lq     mean   median
-  polars (lazy) - from unique parquet file  38.7847  39.6757  44.1609  41.6334
-  arrow (eager) - from unique parquet file 104.4476 108.6151 116.7358 109.0899
-   arrow (lazy) - from unique parquet file 143.5093 144.0727 150.9942 145.7879
- Duckdb and SQL - from unique parquet file  89.8875  90.6194  92.6642  93.4881
+                                      expr      min       lq      mean   median
+  polars (lazy) - from unique parquet file  38.8066  39.0678  43.52496  39.6750
+  arrow (eager) - from unique parquet file  96.0868  97.3807 111.94214 109.0585
+   arrow (lazy) - from unique parquet file 116.7953 118.9454 128.50826 128.2780
+ Duckdb and SQL - from unique parquet file  85.2885  87.5746  88.20928  88.3278
        uq      max neval
-  42.0520  58.6587     5
- 128.3451 133.1814     5
- 154.7482 166.8528     5
-  93.9643  95.3617     5
+  40.0819  59.9935     5
+ 118.3866 138.7981     5
+ 134.7467 143.7759     5
+  88.9761  90.8794     5

πŸ‘‰ Conclusion of this little benchmark based on unique parquet files: the big winner is polars (lazy mode) ! πŸ†πŸ†πŸ†
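The winning query shape looks roughly like this (the file path and aggregation are placeholders standing in for the chapter's query):

library(polars)

pl$scan_parquet("Datasets/my_big_file.parquet")$
  groupby("colString")$
  agg(pl$col("colInt")$mean())$
  collect()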

@@ -1260,13 +1260,13 @@

toc()
-0.201 sec elapsed
+0.157 sec elapsed
@@ -1316,7 +1316,7 @@

toc()
-0.526 sec elapsed
+0.403 sec elapsed
@@ -1357,14 +1357,14 @@

toc()
-0.042 sec elapsed
+0.041 sec elapsed
@@ -1383,13 +1383,13 @@

Unit: milliseconds
                                            expr      min       lq      mean
-   arrow (lazy) - from partitioned parquet file 149.9898 161.4222 166.98852
- dplyr (duckdb) - from partitioned parquet file 463.1010 473.2102 481.29544
-  polars (lazy) - from partitioned parquet file  38.9680  39.6177  44.81158
+   arrow (lazy) - from partitioned parquet file 124.9216 125.9604 131.99446
+ dplyr (duckdb) - from partitioned parquet file 346.2294 359.3282 364.48044
+  polars (lazy) - from partitioned parquet file  40.0859  42.1322  44.76884
    median       uq      max neval
- 163.7782 179.0058 180.7466     5
- 474.0380 491.8206 504.3074     5
-  39.6533  39.9143  65.9046     5
+ 130.9570 135.9574 142.1759     5
+ 360.2117 362.5724 394.0605     5
+  42.9752  43.5088  55.1421     5

πŸ‘‰ Conclusion of this little benchmark based on partitioned parquet files: as for unique parquet files, the big winner is polars (lazy mode) ! πŸ†πŸ†πŸ†
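For the partitioned case, the polars (lazy) query is the same sketch as above; only the path changes to a glob over the partition subfolders, following the pl$scan_parquet() glob usage shown in the import/export chapter (the folder name itself is a placeholder):

library(polars)

# "*/*.parquet" picks up every parquet file in every partition subfolder
pl$scan_parquet("Datasets/my_partitioned_data/*/*.parquet")$
  groupby("colString")$
  agg(pl$col("colInt")$mean())$
  collect()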

@@ -1440,12 +1440,12 @@

  colString min_colInt mean_colInt max_colInt   min_colNum mean_colNum
 1         B       2001    5004.311       7999 3.385660e-05   0.5005457
-2         A       2001    4998.625       7999 3.794138e-05   0.4984446
-3         C       2001    5001.243       7999 3.045052e-05   0.5014723
+2         C       2001    5001.243       7999 3.045052e-05   0.5014723
+3         A       2001    4998.625       7999 3.794138e-05   0.4984446
   max_colNum
 1  0.9999863
-2  0.9999879
-3  0.9999921
+2  0.9999921
+3  0.9999879
toc()
@@ -1465,8 +1465,8 @@

duckdb_bmk

Unit: milliseconds
-                 expr     min      lq     mean  median     uq     max neval
- SQL from duckdb file 79.5339 81.5291 84.39356 84.5486 85.172 91.1842     5
+                 expr     min     lq     mean  median      uq     max neval
+ SQL from duckdb file 73.6214 75.591 78.45332 77.7204 81.3255 84.0083     5

Note that the query with the standard DBI methods is faster than those with dplyr verbs πŸ†
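The two approaches being compared look roughly like this; the duckdb file and table names are assumptions, not the chapter's actual objects:

library(DBI)
library(duckdb)
library(dplyr)

con <- dbConnect(duckdb(), dbdir = "Datasets/my_data.duckdb", read_only = TRUE)

# Standard DBI method: hand the SQL straight to duckdb
res_sql <- dbGetQuery(con, "
  SELECT colString, AVG(colInt) AS mean_colInt
  FROM my_table
  GROUP BY colString")

# dplyr verbs: dbplyr translates the pipeline to SQL before execution
res_dplyr <- tbl(con, "my_table") |>
  group_by(colString) |>
  summarise(mean_colInt = mean(colInt, na.rm = TRUE)) |>
  collect()

dbDisconnect(con, shutdown = TRUE)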

diff --git a/benchmarking_files/figure-html/final-results-plot-1.png b/benchmarking_files/figure-html/final-results-plot-1.png
index e33acfa..da6b2af 100644
Binary files a/benchmarking_files/figure-html/final-results-plot-1.png and b/benchmarking_files/figure-html/final-results-plot-1.png differ
diff --git a/data_manipulation.html b/data_manipulation.html
index f183e9b..21d2721 100644
--- a/data_manipulation.html
+++ b/data_manipulation.html
@@ -977,12 +977,12 @@

)
Unit: milliseconds
-                           expr     min       lq     mean   median       uq
- Without telling col1 is sorted 16.8292 19.26230 20.28964 19.74155 21.03345
-         Telling col1 is sorted  2.1937  2.63195  3.06047  3.01305  3.29305
+                           expr     min       lq      mean   median       uq
+ Without telling col1 is sorted 11.7320 12.49360 12.775551 12.78765 13.02635
+         Telling col1 is sorted  1.8355  2.10485  2.289834  2.26110  2.42480
      max neval
- 28.6123   100
-  4.6473   100
+ 16.0328   100
+  2.9762   100
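For context, these timings come from the chapter's set_sorted() comparison, reproduced here from the filter-rows section (polars and microbenchmark are loaded earlier in the chapter):

mydf <- pl$DataFrame(
  col1 = pl$Series(sort(runif(10000000)))
)

microbenchmark(
  "Without telling col1 is sorted" = mydf$filter(pl$col("col1") < 100),
  "Telling col1 is sorted" = mydf$with_columns(pl$col("col1")$set_sorted())$filter(pl$col("col1") < 100)
)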
diff --git a/import_export.html b/import_export.html
index 2df2d54..ab81f23 100644
--- a/import_export.html
+++ b/import_export.html
@@ -509,7 +509,7 @@

pl$read_csv("https://j.mp/iriscsv")
tmp file placed in 
- /tmp/Rtmpy20iyq/https...j.mp.iriscsv
+ /tmp/RtmpaxmmZX/https...j.mp.iriscsv
shape: (150, 5)
diff --git a/lazy_execution.html b/lazy_execution.html
index 13ddab5..e46a343 100644
--- a/lazy_execution.html
+++ b/lazy_execution.html
@@ -488,21 +488,21 @@ 

collect()

shape: (21, 3)
-β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
-β”‚ region                     ┆ Gold     ┆ Platinium β”‚
-β”‚ ---                        ┆ ---      ┆ ---       β”‚
-β”‚ str                        ┆ f64      ┆ f64       β”‚
-β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ══════════β•ͺ═══════════║
-β”‚ Languedoc-Roussillon       ┆ 54.0     ┆ null      β”‚
-β”‚ Lorraine                   ┆ null     ┆ 81.0      β”‚
-β”‚ Midi-PyrΓ©nΓ©es              ┆ 47.02069 ┆ null      β”‚
-β”‚ Provence-Alpes-CΓ΄te d'Azur ┆ 43.0     ┆ null      β”‚
-β”‚ …                          ┆ …        ┆ …         β”‚
-β”‚ Picardie                   ┆ 60.0     ┆ null      β”‚
-β”‚ Champagne-Ardenne          ┆ null     ┆ null      β”‚
-β”‚ Île-de-France              ┆ 40.0     ┆ 68.0      β”‚
-β”‚ Corse                      ┆ 76.0     ┆ null      β”‚
-β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
+β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
+β”‚ region          ┆ Gold      ┆ Platinium β”‚
+β”‚ ---             ┆ ---       ┆ ---       β”‚
+β”‚ str             ┆ f64       ┆ f64       β”‚
+β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═══════════β•ͺ═══════════║
+β”‚ null            ┆ 60.422772 ┆ 81.0      β”‚
+β”‚ RhΓ΄ne-Alpes     ┆ 60.8004   ┆ 70.784124 β”‚
+β”‚ Corse           ┆ null      ┆ null      β”‚
+β”‚ Bretagne        ┆ 48.082977 ┆ null      β”‚
+β”‚ …               ┆ …         ┆ …         β”‚
+β”‚ Haute-Normandie ┆ null      ┆ null      β”‚
+β”‚ Picardie        ┆ null      ┆ null      β”‚
+β”‚ Franche-ComtΓ©   ┆ 60.0      ┆ null      β”‚
+β”‚ Basse-Normandie ┆ null      ┆ null      β”‚
+β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
@@ -528,21 +528,21 @@

filter( pl$col("region") == "Aquitaine")
-shape: (1_044, 3)
+shape: (6_233, 3)
 β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
 β”‚ region    ┆ departement    ┆ priority β”‚
 β”‚ ---       ┆ ---            ┆ ---      β”‚
 β”‚ str       ┆ str            ┆ cat      β”‚
 β•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════════β•ͺ══════════║
-β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold     β”‚
-β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold     β”‚
-β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold     β”‚
-β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold     β”‚
-β”‚ …         ┆ …              ┆ …        β”‚
 β”‚ Aquitaine ┆ Dordogne       ┆ Silver   β”‚
 β”‚ Aquitaine ┆ Dordogne       ┆ Silver   β”‚
-β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold     β”‚
 β”‚ Aquitaine ┆ Dordogne       ┆ Silver   β”‚
+β”‚ Aquitaine ┆ Dordogne       ┆ Silver   β”‚
+β”‚ …         ┆ …              ┆ …        β”‚
+β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Bronze   β”‚
+β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Bronze   β”‚
+β”‚ Aquitaine ┆ null           ┆ Bronze   β”‚
+β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Bronze   β”‚
 β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
toc()
@@ -562,26 +562,26 @@

pl$col("region") == "Aquitaine")$ collect() # don't forget collect() here!
-shape: (1_044, 3)
+shape: (6_233, 3)
 β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
 β”‚ region    ┆ departement    ┆ priority β”‚
 β”‚ ---       ┆ ---            ┆ ---      β”‚
 β”‚ str       ┆ str            ┆ cat      β”‚
 β•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════════β•ͺ══════════║
-β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold     β”‚
-β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold     β”‚
-β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold     β”‚
-β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold     β”‚
-β”‚ …         ┆ …              ┆ …        β”‚
 β”‚ Aquitaine ┆ Dordogne       ┆ Silver   β”‚
 β”‚ Aquitaine ┆ Dordogne       ┆ Silver   β”‚
-β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold     β”‚
 β”‚ Aquitaine ┆ Dordogne       ┆ Silver   β”‚
+β”‚ Aquitaine ┆ Dordogne       ┆ Silver   β”‚
+β”‚ …         ┆ …              ┆ …        β”‚
+β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Bronze   β”‚
+β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Bronze   β”‚
+β”‚ Aquitaine ┆ null           ┆ Bronze   β”‚
+β”‚ Aquitaine ┆ Lot-et-Garonne ┆ Bronze   β”‚
 β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
toc()
-0.097 sec elapsed
+0.096 sec elapsed
@@ -607,20 +607,20 @@

toc()
-0.133 sec elapsed
+0.127 sec elapsed

… while the lazy method is represented by the pl$scan_csv():

@@ -643,15 +643,15 @@

toc()
@@ -677,19 +677,19 @@

+1 Côtes-d'Armor Silver 32 +2 Côtes-d'Armor Gold 54 +3 Côtes-d'Armor Bronze 18 +4 Finistère Gold 28 +5 Finistère Bronze 18 +6 Ille-et-Vilaine Silver 36.0 +7 Ille-et-Vilaine Platinium NaN +8 Morbihan Bronze 29.3 +9 <NA> Gold 67
toc()
-0.335 sec elapsed
+0.283 sec elapsed
@@ -709,20 +709,20 @@

toc()
-0.017 sec elapsed
+0.016 sec elapsed

And it’s another victory for the lazy execution!

@@ -750,19 +750,19 @@

+1 Côtes-d'Armor Silver 32 +2 Côtes-d'Armor Gold 54 +3 Côtes-d'Armor Bronze 18 +4 Finistère Gold 28 +5 Finistère Bronze 18 +6 Ille-et-Vilaine Silver 36.0 +7 Ille-et-Vilaine Platinium NaN +8 Morbihan Bronze 29.3 +9 <NA> Gold 67
toc()
-0.151 sec elapsed
+0.122 sec elapsed
diff --git a/search.json b/search.json index 5f8a579..41de449 100644 --- a/search.json +++ b/search.json @@ -102,7 +102,7 @@ "href": "data_manipulation.html#filter-rows", "title": "2Β  Data manipulation", "section": "2.4 Filter rows", - "text": "2.4 Filter rows\nThe first option to filter rows of a DataFrame is to use square brackets [] indexing (with integer row number).\n\ndata(iris)\n# The first four lines\npl$DataFrame(iris)[1:4]\n\nshape: (4, 5)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 ┆ cat β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════β•ͺ═════════║\nβ”‚ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 ┆ setosa β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n# The lines 1, 3 and 5\npl$DataFrame(iris)[c(1,3,5)]\n\nshape: (3, 5)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 ┆ cat β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════β•ͺ═════════║\nβ”‚ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 5.0 ┆ 3.6 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\n\n\n\n\n\n\nImportant\n\n\n\nIt’s convenient when you have to quickly inspect your data. But you’ll quickly be limited by the square brackets, as they don’t accept conditions with the expressions. For example pl$DataFrame(iris)[Petal.Length > 6] doesn’t work.\n\n\nThe second and best option is to use the filter() method. 
It must be used with the Polars expression, here the col() method which allows to designate the columns on which the filter condition will be applied.\nLet’s see in details what’s inside a filter() method with an example:\n\npl$col(\"Petal.Length\"): this expression selects the Petal.Length column from iris;\n\n>6: applies a Boolean condition to this expression (for all Petals that have a length > 6).\n\nIn the example below, we will use & operator to apply multiple conditions in filter() method:\n\npolarsR basedplyrdata.table\n\n\n\npl$DataFrame(iris)$filter(\n pl$col(\"Petal.Length\") > 6 & pl$col(\"Petal.Width\") < 2)\n\nshape: (2, 5)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 ┆ cat β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════β•ͺ═══════════║\nβ”‚ 7.3 ┆ 2.9 ┆ 6.3 ┆ 1.8 ┆ virginica β”‚\nβ”‚ 7.4 ┆ 2.8 ┆ 6.1 ┆ 1.9 ┆ virginica β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\n\n\n\niris[iris$Petal.Length > 6 & iris$Petal.Width < 2,] # here don't forget the comma\n\n Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n108 7.3 2.9 6.3 1.8 virginica\n131 7.4 2.8 6.1 1.9 virginica\n\n\n\n\n\niris |>\n filter(Petal.Length > 6 & Petal.Width < 2) \n\n Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n1 7.3 2.9 6.3 1.8 virginica\n2 7.4 2.8 6.1 1.9 virginica\n\n\n\n\n\niris_dt[Petal.Length > 6 & Petal.Width < 2]\n\n Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n1: 7.3 2.9 6.3 1.8 virginica\n2: 7.4 2.8 6.1 1.9 virginica\n\n\n\n\n\n\n\n\n\n\n\nTip\n\n\n\nThe equivalent of %in% R operator is the is_in() method. It should be used in association with the lit() method.\n\npl$DataFrame(\n colA = pl$Series(c(\"a\",\"b\",\"c\"))\n )$filter(\n pl$col(\"colA\")$is_in(pl$lit(c(\"a\",\"b\")))\n )\n\nshape: (2, 1)\nβ”Œβ”€β”€β”€β”€β”€β”€β”\nβ”‚ colA β”‚\nβ”‚ --- β”‚\nβ”‚ str β”‚\nβ•žβ•β•β•β•β•β•β•‘\nβ”‚ a β”‚\nβ”‚ b β”‚\nβ””β”€β”€β”€β”€β”€β”€β”˜\n\n\n\n\nAnother reason for using the filter() method is that filter expressions can be optimised in lazy mode by the query optimiser. Square brackets [] can only be used in eager mode.\n\n\n\n\n\n\nTip\n\n\n\nThere is another way to speed up filter processing on rows: tell polars that the column(s) used to filter rows are already sorted! 
To do this, you can use the set_sorted() method.\nHere’s an example:\n\nmydf <- pl$DataFrame(\n col1 = pl$Series(sort(runif(10000000)))\n)\n\nmicrobenchmark(\n \"Without telling col1 is sorted\" = mydf$filter(pl$col(\"col1\") < 100),\n \"Telling col1 is sorted\" = mydf$with_columns(pl$col(\"col1\")$set_sorted())$filter(pl$col(\"col1\") < 100)\n )\n\nUnit: milliseconds\n expr min lq mean median uq\n Without telling col1 is sorted 16.8292 19.26230 20.28964 19.74155 21.03345\n Telling col1 is sorted 2.1937 2.63195 3.06047 3.01305 3.29305\n max neval\n 28.6123 100\n 4.6473 100" + "text": "2.4 Filter rows\nThe first option to filter rows of a DataFrame is to use square brackets [] indexing (with integer row number).\n\ndata(iris)\n# The first four lines\npl$DataFrame(iris)[1:4]\n\nshape: (4, 5)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 ┆ cat β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════β•ͺ═════════║\nβ”‚ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 ┆ setosa β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n# The lines 1, 3 and 5\npl$DataFrame(iris)[c(1,3,5)]\n\nshape: (3, 5)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 ┆ cat β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════β•ͺ═════════║\nβ”‚ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 5.0 ┆ 3.6 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\n\n\n\n\n\n\nImportant\n\n\n\nIt’s convenient when you have to quickly inspect your data. But you’ll quickly be limited by the square brackets, as they don’t accept conditions with the expressions. For example pl$DataFrame(iris)[Petal.Length > 6] doesn’t work.\n\n\nThe second and best option is to use the filter() method. 
It must be used with the Polars expression, here the col() method which allows to designate the columns on which the filter condition will be applied.\nLet’s see in details what’s inside a filter() method with an example:\n\npl$col(\"Petal.Length\"): this expression selects the Petal.Length column from iris;\n\n>6: applies a Boolean condition to this expression (for all Petals that have a length > 6).\n\nIn the example below, we will use & operator to apply multiple conditions in filter() method:\n\npolarsR basedplyrdata.table\n\n\n\npl$DataFrame(iris)$filter(\n pl$col(\"Petal.Length\") > 6 & pl$col(\"Petal.Width\") < 2)\n\nshape: (2, 5)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 ┆ cat β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════β•ͺ═══════════║\nβ”‚ 7.3 ┆ 2.9 ┆ 6.3 ┆ 1.8 ┆ virginica β”‚\nβ”‚ 7.4 ┆ 2.8 ┆ 6.1 ┆ 1.9 ┆ virginica β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\n\n\n\niris[iris$Petal.Length > 6 & iris$Petal.Width < 2,] # here don't forget the comma\n\n Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n108 7.3 2.9 6.3 1.8 virginica\n131 7.4 2.8 6.1 1.9 virginica\n\n\n\n\n\niris |>\n filter(Petal.Length > 6 & Petal.Width < 2) \n\n Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n1 7.3 2.9 6.3 1.8 virginica\n2 7.4 2.8 6.1 1.9 virginica\n\n\n\n\n\niris_dt[Petal.Length > 6 & Petal.Width < 2]\n\n Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n1: 7.3 2.9 6.3 1.8 virginica\n2: 7.4 2.8 6.1 1.9 virginica\n\n\n\n\n\n\n\n\n\n\n\nTip\n\n\n\nThe equivalent of %in% R operator is the is_in() method. It should be used in association with the lit() method.\n\npl$DataFrame(\n colA = pl$Series(c(\"a\",\"b\",\"c\"))\n )$filter(\n pl$col(\"colA\")$is_in(pl$lit(c(\"a\",\"b\")))\n )\n\nshape: (2, 1)\nβ”Œβ”€β”€β”€β”€β”€β”€β”\nβ”‚ colA β”‚\nβ”‚ --- β”‚\nβ”‚ str β”‚\nβ•žβ•β•β•β•β•β•β•‘\nβ”‚ a β”‚\nβ”‚ b β”‚\nβ””β”€β”€β”€β”€β”€β”€β”˜\n\n\n\n\nAnother reason for using the filter() method is that filter expressions can be optimised in lazy mode by the query optimiser. Square brackets [] can only be used in eager mode.\n\n\n\n\n\n\nTip\n\n\n\nThere is another way to speed up filter processing on rows: tell polars that the column(s) used to filter rows are already sorted! 
To do this, you can use the set_sorted() method.\nHere’s an example:\n\nmydf <- pl$DataFrame(\n col1 = pl$Series(sort(runif(10000000)))\n)\n\nmicrobenchmark(\n \"Without telling col1 is sorted\" = mydf$filter(pl$col(\"col1\") < 100),\n \"Telling col1 is sorted\" = mydf$with_columns(pl$col(\"col1\")$set_sorted())$filter(pl$col(\"col1\") < 100)\n )\n\nUnit: milliseconds\n expr min lq mean median uq\n Without telling col1 is sorted 11.7320 12.49360 12.775551 12.78765 13.02635\n Telling col1 is sorted 1.8355 2.10485 2.289834 2.26110 2.42480\n max neval\n 16.0328 100\n 2.9762 100" }, { "objectID": "data_manipulation.html#select-columns", @@ -214,7 +214,7 @@ "href": "import_export.html#import-data", "title": "3Β  Import/Export", "section": "3.1 Import data", - "text": "3.1 Import data\n\n3.1.1 Read a csv file or URL\nThe read_csv() method can be used to import a csv file from a file or an URL. read_csv() returns a DataFrame.\nIts main arguments are:\n\npath: path to a file or URL.\n\nsep: single character to use as delimiter in the csv file.\n\nignore_errors: boolean. Indicate if the first row of dataset is a header or not. If set to FALSE, column names will be autogenerated in the following format: column_x, with x being an enumeration over every column in the dataset starting at 1.\n\nskip_rows: integer. Start reading after skip_rows lines. The header will be parsed at this offset.\n\nn_rows: integer. Stop reading after reading n_rows.\n\ncache: boolean. Cache the result after reading.\n\noverwrite_dtype: named list of dtypes where name points to a column. Can overwrite dtypes during inference.\n\nlow_memory: boolean. Reduce memory usage in expense of performance.\n\ncomment_char: single byte character used for csv quoting, default = β€œ. Set to NA to turn off special handling and escaping of quotes.\n\nnull_values: values to interpret as null values.\n\ninfer_schem_length: maximum number of rows to read to infer the column types. If set to 0, all columns will be read as UTF-8. If NULL, a full table scan will be done (slow).\n\nskip_rows_after_header: boolean. Skip this number of rows when the header is parsed.\n\nencoding: either β€œutf8” or β€œutf8-lossy”. Lossy means that invalid utf8 values are replaced with β€œ?” characters.\n\nrow_count_name: string. Name of a added row count column.\n\nrow_count_offset: integer. Offset to start the row_count column (only used if the name is set).\n\nparse_dates: boolean. Try to automatically parse dates. If this does not succeed, the column remains of data type Utf8.\n\nreuse_downloaded: boolean. If TRUE(default) and a URL was provided, cache the downloaded files in session for an easy reuse.\n\nBy default, polars takes the first row of the csv file as the header to set column names. 
If the first row is not a header, you can set the argument has_header = FALSE and the column names will be column_1, column_2…\n\n3.1.1.1 From a file\n\npolarsR base\n\n\n\npl$read_csv(\"examples/iris.csv\")\n\nshape: (150, 5)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════β•ͺ═══════════║\nβ”‚ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 ┆ setosa β”‚\nβ”‚ … ┆ … ┆ … ┆ … ┆ … β”‚\nβ”‚ 6.3 ┆ 2.5 ┆ 5.0 ┆ 1.9 ┆ virginica β”‚\nβ”‚ 6.5 ┆ 3.0 ┆ 5.2 ┆ 2.0 ┆ virginica β”‚\nβ”‚ 6.2 ┆ 3.4 ┆ 5.4 ┆ 2.3 ┆ virginica β”‚\nβ”‚ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\n\n\n\nread.csv(\"examples/iris.csv\")\n\n Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n1 5.1 3.5 1.4 0.2 setosa\n2 4.9 3.0 1.4 0.2 setosa\n3 4.7 3.2 1.3 0.2 setosa\n4 4.6 3.1 1.5 0.2 setosa\n5 5.0 3.6 1.4 0.2 setosa\n6 5.4 3.9 1.7 0.4 setosa\n7 4.6 3.4 1.4 0.3 setosa\n8 5.0 3.4 1.5 0.2 setosa\n9 4.4 2.9 1.4 0.2 setosa\n10 4.9 3.1 1.5 0.1 setosa\n11 5.4 3.7 1.5 0.2 setosa\n12 4.8 3.4 1.6 0.2 setosa\n13 4.8 3.0 1.4 0.1 setosa\n14 4.3 3.0 1.1 0.1 setosa\n15 5.8 4.0 1.2 0.2 setosa\n16 5.7 4.4 1.5 0.4 setosa\n17 5.4 3.9 1.3 0.4 setosa\n18 5.1 3.5 1.4 0.3 setosa\n19 5.7 3.8 1.7 0.3 setosa\n20 5.1 3.8 1.5 0.3 setosa\n21 5.4 3.4 1.7 0.2 setosa\n22 5.1 3.7 1.5 0.4 setosa\n23 4.6 3.6 1.0 0.2 setosa\n24 5.1 3.3 1.7 0.5 setosa\n25 4.8 3.4 1.9 0.2 setosa\n26 5.0 3.0 1.6 0.2 setosa\n27 5.0 3.4 1.6 0.4 setosa\n28 5.2 3.5 1.5 0.2 setosa\n29 5.2 3.4 1.4 0.2 setosa\n30 4.7 3.2 1.6 0.2 setosa\n31 4.8 3.1 1.6 0.2 setosa\n32 5.4 3.4 1.5 0.4 setosa\n33 5.2 4.1 1.5 0.1 setosa\n34 5.5 4.2 1.4 0.2 setosa\n35 4.9 3.1 1.5 0.2 setosa\n36 5.0 3.2 1.2 0.2 setosa\n37 5.5 3.5 1.3 0.2 setosa\n38 4.9 3.6 1.4 0.1 setosa\n39 4.4 3.0 1.3 0.2 setosa\n40 5.1 3.4 1.5 0.2 setosa\n41 5.0 3.5 1.3 0.3 setosa\n42 4.5 2.3 1.3 0.3 setosa\n43 4.4 3.2 1.3 0.2 setosa\n44 5.0 3.5 1.6 0.6 setosa\n45 5.1 3.8 1.9 0.4 setosa\n46 4.8 3.0 1.4 0.3 setosa\n47 5.1 3.8 1.6 0.2 setosa\n48 4.6 3.2 1.4 0.2 setosa\n49 5.3 3.7 1.5 0.2 setosa\n50 5.0 3.3 1.4 0.2 setosa\n51 7.0 3.2 4.7 1.4 versicolor\n52 6.4 3.2 4.5 1.5 versicolor\n53 6.9 3.1 4.9 1.5 versicolor\n54 5.5 2.3 4.0 1.3 versicolor\n55 6.5 2.8 4.6 1.5 versicolor\n56 5.7 2.8 4.5 1.3 versicolor\n57 6.3 3.3 4.7 1.6 versicolor\n58 4.9 2.4 3.3 1.0 versicolor\n59 6.6 2.9 4.6 1.3 versicolor\n60 5.2 2.7 3.9 1.4 versicolor\n61 5.0 2.0 3.5 1.0 versicolor\n62 5.9 3.0 4.2 1.5 versicolor\n63 6.0 2.2 4.0 1.0 versicolor\n64 6.1 2.9 4.7 1.4 versicolor\n65 5.6 2.9 3.6 1.3 versicolor\n66 6.7 3.1 4.4 1.4 versicolor\n67 5.6 3.0 4.5 1.5 versicolor\n68 5.8 2.7 4.1 1.0 versicolor\n69 6.2 2.2 4.5 1.5 versicolor\n70 5.6 2.5 3.9 1.1 versicolor\n71 5.9 3.2 4.8 1.8 versicolor\n72 6.1 2.8 4.0 1.3 versicolor\n73 6.3 2.5 4.9 1.5 versicolor\n74 6.1 2.8 4.7 1.2 versicolor\n75 6.4 2.9 4.3 1.3 versicolor\n76 6.6 3.0 4.4 1.4 versicolor\n77 6.8 2.8 4.8 1.4 
versicolor\n78 6.7 3.0 5.0 1.7 versicolor\n79 6.0 2.9 4.5 1.5 versicolor\n80 5.7 2.6 3.5 1.0 versicolor\n81 5.5 2.4 3.8 1.1 versicolor\n82 5.5 2.4 3.7 1.0 versicolor\n83 5.8 2.7 3.9 1.2 versicolor\n84 6.0 2.7 5.1 1.6 versicolor\n85 5.4 3.0 4.5 1.5 versicolor\n86 6.0 3.4 4.5 1.6 versicolor\n87 6.7 3.1 4.7 1.5 versicolor\n88 6.3 2.3 4.4 1.3 versicolor\n89 5.6 3.0 4.1 1.3 versicolor\n90 5.5 2.5 4.0 1.3 versicolor\n91 5.5 2.6 4.4 1.2 versicolor\n92 6.1 3.0 4.6 1.4 versicolor\n93 5.8 2.6 4.0 1.2 versicolor\n94 5.0 2.3 3.3 1.0 versicolor\n95 5.6 2.7 4.2 1.3 versicolor\n96 5.7 3.0 4.2 1.2 versicolor\n97 5.7 2.9 4.2 1.3 versicolor\n98 6.2 2.9 4.3 1.3 versicolor\n99 5.1 2.5 3.0 1.1 versicolor\n100 5.7 2.8 4.1 1.3 versicolor\n101 6.3 3.3 6.0 2.5 virginica\n102 5.8 2.7 5.1 1.9 virginica\n103 7.1 3.0 5.9 2.1 virginica\n104 6.3 2.9 5.6 1.8 virginica\n105 6.5 3.0 5.8 2.2 virginica\n106 7.6 3.0 6.6 2.1 virginica\n107 4.9 2.5 4.5 1.7 virginica\n108 7.3 2.9 6.3 1.8 virginica\n109 6.7 2.5 5.8 1.8 virginica\n110 7.2 3.6 6.1 2.5 virginica\n111 6.5 3.2 5.1 2.0 virginica\n112 6.4 2.7 5.3 1.9 virginica\n113 6.8 3.0 5.5 2.1 virginica\n114 5.7 2.5 5.0 2.0 virginica\n115 5.8 2.8 5.1 2.4 virginica\n116 6.4 3.2 5.3 2.3 virginica\n117 6.5 3.0 5.5 1.8 virginica\n118 7.7 3.8 6.7 2.2 virginica\n119 7.7 2.6 6.9 2.3 virginica\n120 6.0 2.2 5.0 1.5 virginica\n121 6.9 3.2 5.7 2.3 virginica\n122 5.6 2.8 4.9 2.0 virginica\n123 7.7 2.8 6.7 2.0 virginica\n124 6.3 2.7 4.9 1.8 virginica\n125 6.7 3.3 5.7 2.1 virginica\n126 7.2 3.2 6.0 1.8 virginica\n127 6.2 2.8 4.8 1.8 virginica\n128 6.1 3.0 4.9 1.8 virginica\n129 6.4 2.8 5.6 2.1 virginica\n130 7.2 3.0 5.8 1.6 virginica\n131 7.4 2.8 6.1 1.9 virginica\n132 7.9 3.8 6.4 2.0 virginica\n133 6.4 2.8 5.6 2.2 virginica\n134 6.3 2.8 5.1 1.5 virginica\n135 6.1 2.6 5.6 1.4 virginica\n136 7.7 3.0 6.1 2.3 virginica\n137 6.3 3.4 5.6 2.4 virginica\n138 6.4 3.1 5.5 1.8 virginica\n139 6.0 3.0 4.8 1.8 virginica\n140 6.9 3.1 5.4 2.1 virginica\n141 6.7 3.1 5.6 2.4 virginica\n142 6.9 3.1 5.1 2.3 virginica\n143 5.8 2.7 5.1 1.9 virginica\n144 6.8 3.2 5.9 2.3 virginica\n145 6.7 3.3 5.7 2.5 virginica\n146 6.7 3.0 5.2 2.3 virginica\n147 6.3 2.5 5.0 1.9 virginica\n148 6.5 3.0 5.2 2.0 virginica\n149 6.2 3.4 5.4 2.3 virginica\n150 5.9 3.0 5.1 1.8 virginica\n\n\n\n\n\n\n\n3.1.1.2 From multiple files\nFirst, let’s create a dozen csv files\n\ndir.create(\"Datasets\")\nmydf <- data.frame(\n col1 = 1:3,\n col2 = c(\"a\", \"b\", \"c\")\n)\nfor (i in 1:10) {\n write.csv(mydf, file = paste0(\"Datasets/example_data_\",i,\".csv\"))\n}\n\n\n\n\n\n\n\nImportant\n\n\n\nJune 2023: Reading those multiple files into a single DataFrame is not yet implemented in R. 
See here for an example in Python.\n\n\n\n\n3.1.1.3 From an URL\nThe read_csv() method also works with an URL:\n\npl$read_csv(\"https://j.mp/iriscsv\")\n\ntmp file placed in \n /tmp/Rtmpy20iyq/https...j.mp.iriscsv\n\n\nshape: (150, 5)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ sepal_length ┆ sepal_width ┆ petal_length ┆ petal_width ┆ species β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════β•ͺ═══════════║\nβ”‚ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 ┆ setosa β”‚\nβ”‚ … ┆ … ┆ … ┆ … ┆ … β”‚\nβ”‚ 6.3 ┆ 2.5 ┆ 5.0 ┆ 1.9 ┆ virginica β”‚\nβ”‚ 6.5 ┆ 3.0 ┆ 5.2 ┆ 2.0 ┆ virginica β”‚\nβ”‚ 6.2 ┆ 3.4 ┆ 5.4 ┆ 2.3 ┆ virginica β”‚\nβ”‚ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\nπŸ‘‰ For a complete list of arguments to use with the read_csv() method, see this page.\n\n\n\n3.1.2 Scan a csv file\nThe scan_csv() method can be used to lazily read a csv file from a file.\npl$scan_csv() returns a LazyFrame.\nIt’s argument are the same as read_csv() method (see section above).\n\nThis allows the query optimizer to push down predicates and projections to the scan level, thereby potentially reducing memory overhead.\n\n\npl$scan_csv(\n \"examples/iris.csv\")$select( # lazy, don't do a thing\n pl$col(c(\"Petal.Length\",\"Petal.Width\")) # select only 2 columns\n )$\n filter(\n pl$col(\"Petal.Length\") > 4 # the filter is pushed down the scan, so less data is read into memory\n )$collect() # <- don't forget collect() here!\n\nshape: (84, 2)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Petal.Length ┆ Petal.Width β”‚\nβ”‚ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════║\nβ”‚ 4.7 ┆ 1.4 β”‚\nβ”‚ 4.5 ┆ 1.5 β”‚\nβ”‚ 4.9 ┆ 1.5 β”‚\nβ”‚ 4.6 ┆ 1.5 β”‚\nβ”‚ … ┆ … β”‚\nβ”‚ 5.0 ┆ 1.9 β”‚\nβ”‚ 5.2 ┆ 2.0 β”‚\nβ”‚ 5.4 ┆ 2.3 β”‚\nβ”‚ 5.1 ┆ 1.8 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\nπŸ‘‰ For a complete list of arguments to use with the lazy_csv_reader() method, see this page.\n\n\n\n\n\n\nImportant\n\n\n\nJune 2023: arguments available in Python eol_char and with_column_names not yet supporting in R\n\n\n\n\n3.1.3 Scan a parquet file\n\n3.1.3.1 From a single file\nThe scan_parquet() method can be used to lazily read a parquet file from a file.\nScanning delays the actual parsing of the file and pl$scan_parquet() returns a LazyFrame.\nIts main arguments are:\n\npath: path to file.\n\nn_rows: integer. Limit rows to scan.\n\ncache: boolean. Cache the result.\n\nparallel: string. Either β€œAuto”, β€œNone”, β€œColumns” or β€œRowGroups”. The way to parallelized the scan.\n\nrechunk: boolean. rechunk reorganize memory layout, potentially make future operations faster , however perform reallocation now.\n\nrow_count_name: string. Name of a added row count column.\n\nrow_count_offset: integer. 
Offset to start the row_count column (only used if the name is set).\n\nlow_memory: boolean. Reduce memory usage in expense of performance.\n\n\npl$scan_parquet(\"examples/iris.parquet\")\n\n[1] \"polars LazyFrame naive plan: (run ldf$describe_optimized_plan() to see the optimized plan)\"\n\n PARQUET SCAN examples/iris.parquet\n PROJECT */5 COLUMNS\n\n\nπŸ‘‰ For a complete list of arguments to use with the scan_parquet() method, see this page.\nAt the end of the query, don’t forget to use the collect() method to inform Polars that you want to execute it.\n\npl$scan_parquet(\"examples/iris.parquet\")$\n collect()\n\nshape: (150, 5)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 ┆ cat β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════β•ͺ═══════════║\nβ”‚ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 ┆ setosa β”‚\nβ”‚ … ┆ … ┆ … ┆ … ┆ … β”‚\nβ”‚ 6.3 ┆ 2.5 ┆ 5.0 ┆ 1.9 ┆ virginica β”‚\nβ”‚ 6.5 ┆ 3.0 ┆ 5.2 ┆ 2.0 ┆ virginica β”‚\nβ”‚ 6.2 ┆ 3.4 ┆ 5.4 ┆ 2.3 ┆ virginica β”‚\nβ”‚ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\n\n\n\n\n\n\nCaution\n\n\n\nAugust 2023 : Export methods have not yet been implemented in R. This methods start with write_ (write_parquet(), write_parquet(), write_json(), write_ndjson()…)\n\n\n\n\n3.1.3.2 From multiple files\nThe scan_parquet() method can also be used to lazily read multiple parquet files in the same folder.\nThis is particularly useful for partitioned files! 
For example:\n\n# Write multiple parquet files in examples folder\narrow::write_dataset(dataset = iris,\n path = \"examples\",\n partitioning = \"Species\")\n# Reading all parquet files in the example folder and its subfolders\npl$scan_parquet(\"examples/*/*.parquet\")$\n collect()\n\nshape: (150, 4)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════║\nβ”‚ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 β”‚\nβ”‚ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 β”‚\nβ”‚ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 β”‚\nβ”‚ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 β”‚\nβ”‚ … ┆ … ┆ … ┆ … β”‚\nβ”‚ 6.3 ┆ 2.5 ┆ 5.0 ┆ 1.9 β”‚\nβ”‚ 6.5 ┆ 3.0 ┆ 5.2 ┆ 2.0 β”‚\nβ”‚ 6.2 ┆ 3.4 ┆ 5.4 ┆ 2.3 β”‚\nβ”‚ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\nIn the code above:\n\n/* refers to all subfolders in the example folder\n\n/*.parquet refers to all files with a .parquet extension\n\n\n\n\n\n\n\nImportant\n\n\n\nIn this case, note that the Species column which been used for partitioning is missing\n\n\nOf course, the advantage of using pl$scan_parquet() is that you can query several partitioned files and retrieve the result of the query in R. See an example here." + "text": "3.1 Import data\n\n3.1.1 Read a csv file or URL\nThe read_csv() method can be used to import a csv file from a file or an URL. read_csv() returns a DataFrame.\nIts main arguments are:\n\npath: path to a file or URL.\n\nsep: single character to use as delimiter in the csv file.\n\nignore_errors: boolean. Indicate if the first row of dataset is a header or not. If set to FALSE, column names will be autogenerated in the following format: column_x, with x being an enumeration over every column in the dataset starting at 1.\n\nskip_rows: integer. Start reading after skip_rows lines. The header will be parsed at this offset.\n\nn_rows: integer. Stop reading after reading n_rows.\n\ncache: boolean. Cache the result after reading.\n\noverwrite_dtype: named list of dtypes where name points to a column. Can overwrite dtypes during inference.\n\nlow_memory: boolean. Reduce memory usage in expense of performance.\n\ncomment_char: single byte character used for csv quoting, default = β€œ. Set to NA to turn off special handling and escaping of quotes.\n\nnull_values: values to interpret as null values.\n\ninfer_schem_length: maximum number of rows to read to infer the column types. If set to 0, all columns will be read as UTF-8. If NULL, a full table scan will be done (slow).\n\nskip_rows_after_header: boolean. Skip this number of rows when the header is parsed.\n\nencoding: either β€œutf8” or β€œutf8-lossy”. Lossy means that invalid utf8 values are replaced with β€œ?” characters.\n\nrow_count_name: string. Name of a added row count column.\n\nrow_count_offset: integer. Offset to start the row_count column (only used if the name is set).\n\nparse_dates: boolean. Try to automatically parse dates. If this does not succeed, the column remains of data type Utf8.\n\nreuse_downloaded: boolean. 
If TRUE(default) and a URL was provided, cache the downloaded files in session for an easy reuse.\n\nBy default, polars takes the first row of the csv file as the header to set column names. If the first row is not a header, you can set the argument has_header = FALSE and the column names will be column_1, column_2…\n\n3.1.1.1 From a file\n\npolarsR base\n\n\n\npl$read_csv(\"examples/iris.csv\")\n\nshape: (150, 5)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════β•ͺ═══════════║\nβ”‚ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 ┆ setosa β”‚\nβ”‚ … ┆ … ┆ … ┆ … ┆ … β”‚\nβ”‚ 6.3 ┆ 2.5 ┆ 5.0 ┆ 1.9 ┆ virginica β”‚\nβ”‚ 6.5 ┆ 3.0 ┆ 5.2 ┆ 2.0 ┆ virginica β”‚\nβ”‚ 6.2 ┆ 3.4 ┆ 5.4 ┆ 2.3 ┆ virginica β”‚\nβ”‚ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\n\n\n\nread.csv(\"examples/iris.csv\")\n\n Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n1 5.1 3.5 1.4 0.2 setosa\n2 4.9 3.0 1.4 0.2 setosa\n3 4.7 3.2 1.3 0.2 setosa\n4 4.6 3.1 1.5 0.2 setosa\n5 5.0 3.6 1.4 0.2 setosa\n6 5.4 3.9 1.7 0.4 setosa\n7 4.6 3.4 1.4 0.3 setosa\n8 5.0 3.4 1.5 0.2 setosa\n9 4.4 2.9 1.4 0.2 setosa\n10 4.9 3.1 1.5 0.1 setosa\n11 5.4 3.7 1.5 0.2 setosa\n12 4.8 3.4 1.6 0.2 setosa\n13 4.8 3.0 1.4 0.1 setosa\n14 4.3 3.0 1.1 0.1 setosa\n15 5.8 4.0 1.2 0.2 setosa\n16 5.7 4.4 1.5 0.4 setosa\n17 5.4 3.9 1.3 0.4 setosa\n18 5.1 3.5 1.4 0.3 setosa\n19 5.7 3.8 1.7 0.3 setosa\n20 5.1 3.8 1.5 0.3 setosa\n21 5.4 3.4 1.7 0.2 setosa\n22 5.1 3.7 1.5 0.4 setosa\n23 4.6 3.6 1.0 0.2 setosa\n24 5.1 3.3 1.7 0.5 setosa\n25 4.8 3.4 1.9 0.2 setosa\n26 5.0 3.0 1.6 0.2 setosa\n27 5.0 3.4 1.6 0.4 setosa\n28 5.2 3.5 1.5 0.2 setosa\n29 5.2 3.4 1.4 0.2 setosa\n30 4.7 3.2 1.6 0.2 setosa\n31 4.8 3.1 1.6 0.2 setosa\n32 5.4 3.4 1.5 0.4 setosa\n33 5.2 4.1 1.5 0.1 setosa\n34 5.5 4.2 1.4 0.2 setosa\n35 4.9 3.1 1.5 0.2 setosa\n36 5.0 3.2 1.2 0.2 setosa\n37 5.5 3.5 1.3 0.2 setosa\n38 4.9 3.6 1.4 0.1 setosa\n39 4.4 3.0 1.3 0.2 setosa\n40 5.1 3.4 1.5 0.2 setosa\n41 5.0 3.5 1.3 0.3 setosa\n42 4.5 2.3 1.3 0.3 setosa\n43 4.4 3.2 1.3 0.2 setosa\n44 5.0 3.5 1.6 0.6 setosa\n45 5.1 3.8 1.9 0.4 setosa\n46 4.8 3.0 1.4 0.3 setosa\n47 5.1 3.8 1.6 0.2 setosa\n48 4.6 3.2 1.4 0.2 setosa\n49 5.3 3.7 1.5 0.2 setosa\n50 5.0 3.3 1.4 0.2 setosa\n51 7.0 3.2 4.7 1.4 versicolor\n52 6.4 3.2 4.5 1.5 versicolor\n53 6.9 3.1 4.9 1.5 versicolor\n54 5.5 2.3 4.0 1.3 versicolor\n55 6.5 2.8 4.6 1.5 versicolor\n56 5.7 2.8 4.5 1.3 versicolor\n57 6.3 3.3 4.7 1.6 versicolor\n58 4.9 2.4 3.3 1.0 versicolor\n59 6.6 2.9 4.6 1.3 versicolor\n60 5.2 2.7 3.9 1.4 versicolor\n61 5.0 2.0 3.5 1.0 versicolor\n62 5.9 3.0 4.2 1.5 versicolor\n63 6.0 2.2 4.0 1.0 versicolor\n64 6.1 2.9 4.7 1.4 versicolor\n65 5.6 2.9 3.6 1.3 versicolor\n66 6.7 3.1 4.4 1.4 versicolor\n67 5.6 3.0 4.5 1.5 versicolor\n68 5.8 2.7 4.1 1.0 versicolor\n69 6.2 2.2 4.5 1.5 versicolor\n70 5.6 2.5 3.9 1.1 versicolor\n71 5.9 3.2 4.8 1.8 
versicolor\n72 6.1 2.8 4.0 1.3 versicolor\n73 6.3 2.5 4.9 1.5 versicolor\n74 6.1 2.8 4.7 1.2 versicolor\n75 6.4 2.9 4.3 1.3 versicolor\n76 6.6 3.0 4.4 1.4 versicolor\n77 6.8 2.8 4.8 1.4 versicolor\n78 6.7 3.0 5.0 1.7 versicolor\n79 6.0 2.9 4.5 1.5 versicolor\n80 5.7 2.6 3.5 1.0 versicolor\n81 5.5 2.4 3.8 1.1 versicolor\n82 5.5 2.4 3.7 1.0 versicolor\n83 5.8 2.7 3.9 1.2 versicolor\n84 6.0 2.7 5.1 1.6 versicolor\n85 5.4 3.0 4.5 1.5 versicolor\n86 6.0 3.4 4.5 1.6 versicolor\n87 6.7 3.1 4.7 1.5 versicolor\n88 6.3 2.3 4.4 1.3 versicolor\n89 5.6 3.0 4.1 1.3 versicolor\n90 5.5 2.5 4.0 1.3 versicolor\n91 5.5 2.6 4.4 1.2 versicolor\n92 6.1 3.0 4.6 1.4 versicolor\n93 5.8 2.6 4.0 1.2 versicolor\n94 5.0 2.3 3.3 1.0 versicolor\n95 5.6 2.7 4.2 1.3 versicolor\n96 5.7 3.0 4.2 1.2 versicolor\n97 5.7 2.9 4.2 1.3 versicolor\n98 6.2 2.9 4.3 1.3 versicolor\n99 5.1 2.5 3.0 1.1 versicolor\n100 5.7 2.8 4.1 1.3 versicolor\n101 6.3 3.3 6.0 2.5 virginica\n102 5.8 2.7 5.1 1.9 virginica\n103 7.1 3.0 5.9 2.1 virginica\n104 6.3 2.9 5.6 1.8 virginica\n105 6.5 3.0 5.8 2.2 virginica\n106 7.6 3.0 6.6 2.1 virginica\n107 4.9 2.5 4.5 1.7 virginica\n108 7.3 2.9 6.3 1.8 virginica\n109 6.7 2.5 5.8 1.8 virginica\n110 7.2 3.6 6.1 2.5 virginica\n111 6.5 3.2 5.1 2.0 virginica\n112 6.4 2.7 5.3 1.9 virginica\n113 6.8 3.0 5.5 2.1 virginica\n114 5.7 2.5 5.0 2.0 virginica\n115 5.8 2.8 5.1 2.4 virginica\n116 6.4 3.2 5.3 2.3 virginica\n117 6.5 3.0 5.5 1.8 virginica\n118 7.7 3.8 6.7 2.2 virginica\n119 7.7 2.6 6.9 2.3 virginica\n120 6.0 2.2 5.0 1.5 virginica\n121 6.9 3.2 5.7 2.3 virginica\n122 5.6 2.8 4.9 2.0 virginica\n123 7.7 2.8 6.7 2.0 virginica\n124 6.3 2.7 4.9 1.8 virginica\n125 6.7 3.3 5.7 2.1 virginica\n126 7.2 3.2 6.0 1.8 virginica\n127 6.2 2.8 4.8 1.8 virginica\n128 6.1 3.0 4.9 1.8 virginica\n129 6.4 2.8 5.6 2.1 virginica\n130 7.2 3.0 5.8 1.6 virginica\n131 7.4 2.8 6.1 1.9 virginica\n132 7.9 3.8 6.4 2.0 virginica\n133 6.4 2.8 5.6 2.2 virginica\n134 6.3 2.8 5.1 1.5 virginica\n135 6.1 2.6 5.6 1.4 virginica\n136 7.7 3.0 6.1 2.3 virginica\n137 6.3 3.4 5.6 2.4 virginica\n138 6.4 3.1 5.5 1.8 virginica\n139 6.0 3.0 4.8 1.8 virginica\n140 6.9 3.1 5.4 2.1 virginica\n141 6.7 3.1 5.6 2.4 virginica\n142 6.9 3.1 5.1 2.3 virginica\n143 5.8 2.7 5.1 1.9 virginica\n144 6.8 3.2 5.9 2.3 virginica\n145 6.7 3.3 5.7 2.5 virginica\n146 6.7 3.0 5.2 2.3 virginica\n147 6.3 2.5 5.0 1.9 virginica\n148 6.5 3.0 5.2 2.0 virginica\n149 6.2 3.4 5.4 2.3 virginica\n150 5.9 3.0 5.1 1.8 virginica\n\n\n\n\n\n\n\n3.1.1.2 From multiple files\nFirst, let’s create a dozen csv files\n\ndir.create(\"Datasets\")\nmydf <- data.frame(\n col1 = 1:3,\n col2 = c(\"a\", \"b\", \"c\")\n)\nfor (i in 1:10) {\n write.csv(mydf, file = paste0(\"Datasets/example_data_\",i,\".csv\"))\n}\n\n\n\n\n\n\n\nImportant\n\n\n\nJune 2023: Reading those multiple files into a single DataFrame is not yet implemented in R. 
See here for an example in Python.\n\n\n\n\n3.1.1.3 From an URL\nThe read_csv() method also works with an URL:\n\npl$read_csv(\"https://j.mp/iriscsv\")\n\ntmp file placed in \n /tmp/RtmpaxmmZX/https...j.mp.iriscsv\n\n\nshape: (150, 5)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ sepal_length ┆ sepal_width ┆ petal_length ┆ petal_width ┆ species β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════β•ͺ═══════════║\nβ”‚ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 ┆ setosa β”‚\nβ”‚ … ┆ … ┆ … ┆ … ┆ … β”‚\nβ”‚ 6.3 ┆ 2.5 ┆ 5.0 ┆ 1.9 ┆ virginica β”‚\nβ”‚ 6.5 ┆ 3.0 ┆ 5.2 ┆ 2.0 ┆ virginica β”‚\nβ”‚ 6.2 ┆ 3.4 ┆ 5.4 ┆ 2.3 ┆ virginica β”‚\nβ”‚ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\nπŸ‘‰ For a complete list of arguments to use with the read_csv() method, see this page.\n\n\n\n3.1.2 Scan a csv file\nThe scan_csv() method can be used to lazily read a csv file from a file.\npl$scan_csv() returns a LazyFrame.\nIt’s argument are the same as read_csv() method (see section above).\n\nThis allows the query optimizer to push down predicates and projections to the scan level, thereby potentially reducing memory overhead.\n\n\npl$scan_csv(\n \"examples/iris.csv\")$select( # lazy, don't do a thing\n pl$col(c(\"Petal.Length\",\"Petal.Width\")) # select only 2 columns\n )$\n filter(\n pl$col(\"Petal.Length\") > 4 # the filter is pushed down the scan, so less data is read into memory\n )$collect() # <- don't forget collect() here!\n\nshape: (84, 2)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Petal.Length ┆ Petal.Width β”‚\nβ”‚ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════║\nβ”‚ 4.7 ┆ 1.4 β”‚\nβ”‚ 4.5 ┆ 1.5 β”‚\nβ”‚ 4.9 ┆ 1.5 β”‚\nβ”‚ 4.6 ┆ 1.5 β”‚\nβ”‚ … ┆ … β”‚\nβ”‚ 5.0 ┆ 1.9 β”‚\nβ”‚ 5.2 ┆ 2.0 β”‚\nβ”‚ 5.4 ┆ 2.3 β”‚\nβ”‚ 5.1 ┆ 1.8 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\nπŸ‘‰ For a complete list of arguments to use with the lazy_csv_reader() method, see this page.\n\n\n\n\n\n\nImportant\n\n\n\nJune 2023: arguments available in Python eol_char and with_column_names not yet supporting in R\n\n\n\n\n3.1.3 Scan a parquet file\n\n3.1.3.1 From a single file\nThe scan_parquet() method can be used to lazily read a parquet file from a file.\nScanning delays the actual parsing of the file and pl$scan_parquet() returns a LazyFrame.\nIts main arguments are:\n\npath: path to file.\n\nn_rows: integer. Limit rows to scan.\n\ncache: boolean. Cache the result.\n\nparallel: string. Either β€œAuto”, β€œNone”, β€œColumns” or β€œRowGroups”. The way to parallelized the scan.\n\nrechunk: boolean. rechunk reorganize memory layout, potentially make future operations faster , however perform reallocation now.\n\nrow_count_name: string. Name of a added row count column.\n\nrow_count_offset: integer. 
Offset to start the row_count column (only used if the name is set).\n\nlow_memory: boolean. Reduce memory usage in expense of performance.\n\n\npl$scan_parquet(\"examples/iris.parquet\")\n\n[1] \"polars LazyFrame naive plan: (run ldf$describe_optimized_plan() to see the optimized plan)\"\n\n PARQUET SCAN examples/iris.parquet\n PROJECT */5 COLUMNS\n\n\nπŸ‘‰ For a complete list of arguments to use with the scan_parquet() method, see this page.\nAt the end of the query, don’t forget to use the collect() method to inform Polars that you want to execute it.\n\npl$scan_parquet(\"examples/iris.parquet\")$\n collect()\n\nshape: (150, 5)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 ┆ cat β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════β•ͺ═══════════║\nβ”‚ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa β”‚\nβ”‚ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 ┆ setosa β”‚\nβ”‚ … ┆ … ┆ … ┆ … ┆ … β”‚\nβ”‚ 6.3 ┆ 2.5 ┆ 5.0 ┆ 1.9 ┆ virginica β”‚\nβ”‚ 6.5 ┆ 3.0 ┆ 5.2 ┆ 2.0 ┆ virginica β”‚\nβ”‚ 6.2 ┆ 3.4 ┆ 5.4 ┆ 2.3 ┆ virginica β”‚\nβ”‚ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\n\n\n\n\n\n\nCaution\n\n\n\nAugust 2023 : Export methods have not yet been implemented in R. This methods start with write_ (write_parquet(), write_parquet(), write_json(), write_ndjson()…)\n\n\n\n\n3.1.3.2 From multiple files\nThe scan_parquet() method can also be used to lazily read multiple parquet files in the same folder.\nThis is particularly useful for partitioned files! 
For example:\n\n# Write multiple parquet files in examples folder\narrow::write_dataset(dataset = iris,\n path = \"examples\",\n partitioning = \"Species\")\n# Reading all parquet files in the example folder and its subfolders\npl$scan_parquet(\"examples/*/*.parquet\")$\n collect()\n\nshape: (150, 4)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ f64 ┆ f64 ┆ f64 ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ══════════════β•ͺ═════════════║\nβ”‚ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 β”‚\nβ”‚ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 β”‚\nβ”‚ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 β”‚\nβ”‚ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 β”‚\nβ”‚ … ┆ … ┆ … ┆ … β”‚\nβ”‚ 6.3 ┆ 2.5 ┆ 5.0 ┆ 1.9 β”‚\nβ”‚ 6.5 ┆ 3.0 ┆ 5.2 ┆ 2.0 β”‚\nβ”‚ 6.2 ┆ 3.4 ┆ 5.4 ┆ 2.3 β”‚\nβ”‚ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\nIn the code above:\n\n/* refers to all subfolders in the example folder\n\n/*.parquet refers to all files with a .parquet extension\n\n\n\n\n\n\n\nImportant\n\n\n\nIn this case, note that the Species column which been used for partitioning is missing\n\n\nOf course, the advantage of using pl$scan_parquet() is that you can query several partitioned files and retrieve the result of the query in R. See an example here." }, { "objectID": "import_export.html#export-data-to-excel", @@ -235,49 +235,49 @@ "href": "lazy_execution.html#lazy-vs-eager-mode-comparison", "title": "4Β  Lazy execution", "section": "4.2 Lazy vs eager mode comparison", - "text": "4.2 Lazy vs eager mode comparison\n\n4.2.1 General principles\nIn this first example we use the eager API:\n\ndf <- pl$read_csv(\"examples/iris.csv\")\ndf_small = df$filter(pl$col(\"Petal.Length\") > 5)\ndf_agg = df_small$groupby(\"Species\")$agg(pl$col(\"Petal.Width\")$median())\ndf_agg\n\nshape: (2, 2)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Species ┆ Petal.Width β”‚\nβ”‚ --- ┆ --- β”‚\nβ”‚ str ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════║\nβ”‚ virginica ┆ 2.1 β”‚\nβ”‚ versicolor ┆ 1.6 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\nThis example:\n\nRead the iris dataset.\nFilter the dataset based on Petal.Length\nCalculate the median of the Petal.Width per Species\n\nEvery step is executed immediately returning the intermediate results. This can be very wastefull as we might do work or load extra data that is not being used.\nIf we instead used the lazy API and waited on execution untill all the steps are defined then the query planner could perform various optimizations. In this case:\n\nPredicate pushdown: Apply filters as early as possible while reading the dataset, thus only reading rows with sepal length greater than 5.\nProjection pushdown: Select only the columns that are needed while reading the dataset, thus removing the need to load additional columns\n\n\n\n\n\n\n\nTip\n\n\n\nTo consult the list of optimisations made by Polars on queries in lazy mode, see this page..\n\n\nHere is the equivalent code using the lazy API. 
At the end of the query, don’t forget to use the collect() method to inform Polars that you want to execute it.\n\npl$scan_csv(\"examples/iris.csv\")$\n filter(\n pl$col(\"Petal.Length\") > 5)$\n groupby(\"Species\")$\n agg(pl$col(\"Petal.Width\")$median())$\n collect() # <- don't forget collect() here!\n\nshape: (2, 2)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Species ┆ Petal.Width β”‚\nβ”‚ --- ┆ --- β”‚\nβ”‚ str ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════║\nβ”‚ virginica ┆ 2.1 β”‚\nβ”‚ versicolor ┆ 1.6 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\n\n\n\n\n\n\nImportant\n\n\n\nUse lazy execution will signficantly lower the load on memory & CPU thus allowing you to fit bigger datasets in memory and process faster.\n\n\nThe next section will demonstrate this time saving. πŸ‘‡\n\n\n4.2.2 Limits of lazy mode\nThere are some operations that cannot be performed in lazy mode (whether in polars or other lazy frameworks such as SQL database). One limitation is that Polars needs to know the column names and dtypes at each step of the query plan.\nFor example, we can’t pivot() (see here) in lazy mode as the column names are data-dependant following a pivot. Indeed, when you have to pivot() a DataFrame your future columns names cannot be predicted because it depends on what it is actually in your datasets!\nWhen you have to do operations that can be done in lazy mode, the recommandation is: - Running your query in lazy mode as far as possible;\n- Evaluating this lazy query with collect() when you need a non-lazy method;\n- Running the non-lazy methods;\n- Calling lazy() on the output to continue in lazy mode.\nHere’s an example:\n\npl$scan_parquet(\"Datasets/fakir_file.parquet\")$\n # Call collect() because I need to pivot()\n collect()$\n pivot(\n index = \"region\",\n columns = \"priority\",\n values = \"age\", \n aggregate_function = \"mean\"\n )$\n # Continue in lazy mode\n lazy()$\n select(\n pl$col(c(\"region\",\"Gold\",\"Platinium\"))\n )$\n collect()\n\nshape: (21, 3)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ region ┆ Gold ┆ Platinium β”‚\nβ”‚ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ f64 ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ══════════β•ͺ═══════════║\nβ”‚ Languedoc-Roussillon ┆ 54.0 ┆ null β”‚\nβ”‚ Lorraine ┆ null ┆ 81.0 β”‚\nβ”‚ Midi-PyrΓ©nΓ©es ┆ 47.02069 ┆ null β”‚\nβ”‚ Provence-Alpes-CΓ΄te d'Azur ┆ 43.0 ┆ null β”‚\nβ”‚ … ┆ … ┆ … β”‚\nβ”‚ Picardie ┆ 60.0 ┆ null β”‚\nβ”‚ Champagne-Ardenne ┆ null ┆ null β”‚\nβ”‚ Île-de-France ┆ 40.0 ┆ 68.0 β”‚\nβ”‚ Corse ┆ 76.0 ┆ null β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜" + "text": "4.2 Lazy vs eager mode comparison\n\n4.2.1 General principles\nIn this first example we use the eager API:\n\ndf <- pl$read_csv(\"examples/iris.csv\")\ndf_small = df$filter(pl$col(\"Petal.Length\") > 5)\ndf_agg = df_small$groupby(\"Species\")$agg(pl$col(\"Petal.Width\")$median())\ndf_agg\n\nshape: (2, 2)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Species ┆ Petal.Width β”‚\nβ”‚ --- ┆ --- β”‚\nβ”‚ str ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════║\nβ”‚ virginica ┆ 2.1 β”‚\nβ”‚ versicolor ┆ 1.6 
β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\nThis example:\n\nRead the iris dataset.\nFilter the dataset based on Petal.Length\nCalculate the median of the Petal.Width per Species\n\nEvery step is executed immediately returning the intermediate results. This can be very wastefull as we might do work or load extra data that is not being used.\nIf we instead used the lazy API and waited on execution untill all the steps are defined then the query planner could perform various optimizations. In this case:\n\nPredicate pushdown: Apply filters as early as possible while reading the dataset, thus only reading rows with sepal length greater than 5.\nProjection pushdown: Select only the columns that are needed while reading the dataset, thus removing the need to load additional columns\n\n\n\n\n\n\n\nTip\n\n\n\nTo consult the list of optimisations made by Polars on queries in lazy mode, see this page..\n\n\nHere is the equivalent code using the lazy API. At the end of the query, don’t forget to use the collect() method to inform Polars that you want to execute it.\n\npl$scan_csv(\"examples/iris.csv\")$\n filter(\n pl$col(\"Petal.Length\") > 5)$\n groupby(\"Species\")$\n agg(pl$col(\"Petal.Width\")$median())$\n collect() # <- don't forget collect() here!\n\nshape: (2, 2)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ Species ┆ Petal.Width β”‚\nβ”‚ --- ┆ --- β”‚\nβ”‚ str ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════║\nβ”‚ virginica ┆ 2.1 β”‚\nβ”‚ versicolor ┆ 1.6 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\n\n\n\n\n\n\n\nImportant\n\n\n\nUse lazy execution will signficantly lower the load on memory & CPU thus allowing you to fit bigger datasets in memory and process faster.\n\n\nThe next section will demonstrate this time saving. πŸ‘‡\n\n\n4.2.2 Limits of lazy mode\nThere are some operations that cannot be performed in lazy mode (whether in polars or other lazy frameworks such as SQL database). One limitation is that Polars needs to know the column names and dtypes at each step of the query plan.\nFor example, we can’t pivot() (see here) in lazy mode as the column names are data-dependant following a pivot. 
Indeed, when you have to pivot() a DataFrame your future columns names cannot be predicted because it depends on what it is actually in your datasets!\nWhen you have to do operations that can be done in lazy mode, the recommandation is: - Running your query in lazy mode as far as possible;\n- Evaluating this lazy query with collect() when you need a non-lazy method;\n- Running the non-lazy methods;\n- Calling lazy() on the output to continue in lazy mode.\nHere’s an example:\n\npl$scan_parquet(\"Datasets/fakir_file.parquet\")$\n # Call collect() because I need to pivot()\n collect()$\n pivot(\n index = \"region\",\n columns = \"priority\",\n values = \"age\", \n aggregate_function = \"mean\"\n )$\n # Continue in lazy mode\n lazy()$\n select(\n pl$col(c(\"region\",\"Gold\",\"Platinium\"))\n )$\n collect()\n\nshape: (21, 3)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ region ┆ Gold ┆ Platinium β”‚\nβ”‚ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ f64 ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═══════════β•ͺ═══════════║\nβ”‚ null ┆ 60.422772 ┆ 81.0 β”‚\nβ”‚ RhΓ΄ne-Alpes ┆ 60.8004 ┆ 70.784124 β”‚\nβ”‚ Corse ┆ null ┆ null β”‚\nβ”‚ Bretagne ┆ 48.082977 ┆ null β”‚\nβ”‚ … ┆ … ┆ … β”‚\nβ”‚ Haute-Normandie ┆ null ┆ null β”‚\nβ”‚ Picardie ┆ null ┆ null β”‚\nβ”‚ Franche-ComtΓ© ┆ 60.0 ┆ null β”‚\nβ”‚ Basse-Normandie ┆ null ┆ null β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜" }, { "objectID": "lazy_execution.html#lazy-vs-eager-mode-fight", "href": "lazy_execution.html#lazy-vs-eager-mode-fight", "title": "4Β  Lazy execution", "section": "4.3 Lazy vs eager mode : fight! βš”οΈ", - "text": "4.3 Lazy vs eager mode : fight! βš”οΈ\nFor this fight, we’re going to use a fake dataset with 1 000 000 rows and 25 columns created with the {fakir} package. 
The code for creating this dataset is available at the beginning of this document.\nThis fight will take place over 3 rounds :\n\nWith an eager query versus a lazy query from a DataFrame\nWith an eager query versus a lazy query from a csv file\nWith an eager query versus a lazy query from a parquet file\n\n\n4.3.1 From a DataFrame\nFor this first round and as seen above, let’s start with a simple query from a DataFrame:\n\ntic()\n#| label: fight-eager_dataframe\npl$DataFrame(fake_data)$select(\n pl$col(c(\"region\",\"departement\",\"priority\")) \n )$\n filter(\n pl$col(\"region\") == \"Aquitaine\")\n\nshape: (1_044, 3)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ region ┆ departement ┆ priority β”‚\nβ”‚ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ str ┆ cat β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════════β•ͺ══════════║\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold β”‚\nβ”‚ … ┆ … ┆ … β”‚\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold β”‚\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.101 sec elapsed\n\n\nAs seen above, we’re going to use the lazy() method to convert a DataFrame to a LazyFrame:\n\ntic()\n#| label: fight-lazy_lazyframe\npl$DataFrame(fake_data)$lazy()$\n select(\n pl$col(c(\"region\",\"departement\",\"priority\")) \n )$\n filter(\n pl$col(\"region\") == \"Aquitaine\")$\n collect() # don't forget collect() here!\n\nshape: (1_044, 3)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ region ┆ departement ┆ priority β”‚\nβ”‚ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ str ┆ cat β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════════β•ͺ══════════║\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold β”‚\nβ”‚ … ┆ … ┆ … β”‚\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Gold β”‚\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.097 sec elapsed\n\n\n\n\n4.3.2 From a csv file\nNow, the eager mode is represented here by the read_csv() method…\n\ntic()\n#| label: fight-eager_read_csv\npl$read_csv(\"Datasets/fakir_file.csv\", infer_schema_length=0)$ \n select(\n pl$col(c(\"region\",\"departement\",\"priority\",\"age\")))$\n with_columns(\n pl$col(\"age\")$cast(pl$Int32,strict = FALSE))$\n filter(\n pl$col(\"region\") == \"Bretagne\")$\n groupby(\"departement\",\"priority\")$\n agg(pl$col(\"age\")$mean())\n\nshape: (9, 3)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ departement ┆ priority ┆ age β”‚\nβ”‚ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ str ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═══════════β•ͺ═══════════║\nβ”‚ CΓ΄tes-d'Armor ┆ Bronze ┆ 24.2 β”‚\nβ”‚ Ille-et-Vilaine ┆ Gold ┆ null β”‚\nβ”‚ Ille-et-Vilaine ┆ Silver ┆ 68.0 β”‚\nβ”‚ Morbihan ┆ Bronze ┆ 37.0 β”‚\nβ”‚ FinistΓ¨re ┆ Gold ┆ null β”‚\nβ”‚ Morbihan 
┆ Silver ┆ 41.0 β”‚\nβ”‚ FinistΓ¨re ┆ Bronze ┆ 36.23828 β”‚\nβ”‚ NA ┆ Platinium ┆ 78.724672 β”‚\nβ”‚ Ille-et-Vilaine ┆ Bronze ┆ null β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.133 sec elapsed\n\n\n… while the lazy method is represented by the pl$scan_csv():\n\ntic()\npl$scan_csv(\"Datasets/fakir_file.csv\", infer_schema_length=0)$\n select(\n pl$col(c(\"region\",\"departement\",\"priority\",\"age\")))$\n with_columns(\n pl$col(\"age\")$cast(pl$Int32,strict = FALSE))$\n filter(\n pl$col(\"region\") == \"Bretagne\")$\n groupby(\"departement\",\"priority\")$\n agg(pl$col(\"age\")$mean())$\n collect()\n\nshape: (9, 3)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ departement ┆ priority ┆ age β”‚\nβ”‚ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ str ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═══════════β•ͺ═══════════║\nβ”‚ Ille-et-Vilaine ┆ Silver ┆ 68.0 β”‚\nβ”‚ Morbihan ┆ Silver ┆ 41.0 β”‚\nβ”‚ CΓ΄tes-d'Armor ┆ Bronze ┆ 24.2 β”‚\nβ”‚ Morbihan ┆ Bronze ┆ 37.0 β”‚\nβ”‚ FinistΓ¨re ┆ Bronze ┆ 36.23828 β”‚\nβ”‚ Ille-et-Vilaine ┆ Bronze ┆ null β”‚\nβ”‚ FinistΓ¨re ┆ Gold ┆ null β”‚\nβ”‚ NA ┆ Platinium ┆ 78.724672 β”‚\nβ”‚ Ille-et-Vilaine ┆ Gold ┆ null β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.047 sec elapsed\n\n\nWe can clearly see that we save a lot of time when executing the lazy version of the code!\n\n\n4.3.3 From a parquet file\nThe read_parquet() method has not been implemented in the R Polars package, but for this fight we will use arrow::read_parquet() and {dplyr} syntax, which will compete with pl$scan_parquet().\n\ntic()\narrow::read_parquet(\"Datasets/fakir_file.parquet\", as_data_frame = FALSE) |>\n filter(region == \"Bretagne\") |> \n group_by(departement,priority) |> \n summarise(mymean=mean(age, na.rm = TRUE)) |> \n arrange(departement) |>\n collect()\n\n# A tibble: 9 Γ— 3\n# Groups: departement [5]\n departement priority mymean\n <chr> <fct> <dbl>\n1 CΓ΄tes-d'Armor Bronze 24.2\n2 FinistΓ¨re Gold NaN \n3 FinistΓ¨re Bronze 36.2\n4 Ille-et-Vilaine Bronze NaN \n5 Ille-et-Vilaine Silver 68 \n6 Ille-et-Vilaine Gold NaN \n7 Morbihan Bronze 37 \n8 Morbihan Silver 41 \n9 <NA> Platinium 78.7\n\ntoc()\n\n0.335 sec elapsed\n\n\n\ntic()\npl$scan_parquet(\"Datasets/fakir_file.parquet\")$\n filter( \n pl$col(\"region\") == \"Bretagne\")$\n groupby(c(\"departement\",\"priority\"))$\n agg(\n pl$col(c(\"age\"))$mean()\n)$sort(\"departement\")$\n collect()\n\nshape: (9, 3)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ departement ┆ priority ┆ age β”‚\nβ”‚ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ cat ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═══════════β•ͺ═══════════║\nβ”‚ null ┆ Platinium ┆ 78.724672 β”‚\nβ”‚ CΓ΄tes-d'Armor ┆ Bronze ┆ 24.2 β”‚\nβ”‚ FinistΓ¨re ┆ Bronze ┆ 36.23828 β”‚\nβ”‚ FinistΓ¨re ┆ Gold ┆ null β”‚\nβ”‚ Ille-et-Vilaine ┆ Bronze ┆ null β”‚\nβ”‚ Ille-et-Vilaine ┆ Silver ┆ 68.0 β”‚\nβ”‚ Ille-et-Vilaine ┆ Gold ┆ null β”‚\nβ”‚ Morbihan ┆ Bronze ┆ 37.0 β”‚\nβ”‚ Morbihan ┆ Silver ┆ 41.0 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.017 sec elapsed\n\n\nAnd it’s another victory 
for the lazy execution!\n\n\n\n\n\n\nImportant\n\n\n\nNote that the {arrow} package also have ability to scan parquet files in a lazy way with the arrow::open_dataset function.\n\ntic()\narrow::open_dataset(\"Datasets/fakir_file.parquet\") |> \n filter(region == \"Bretagne\") |>\n group_by(departement,priority) |> \n summarise(mymean=mean(age, na.rm = TRUE)) |>\n arrange(departement) |>\n collect()\n\n# A tibble: 9 Γ— 3\n# Groups: departement [5]\n departement priority mymean\n <chr> <fct> <dbl>\n1 CΓ΄tes-d'Armor Bronze 24.2\n2 FinistΓ¨re Gold NaN \n3 FinistΓ¨re Bronze 36.2\n4 Ille-et-Vilaine Bronze NaN \n5 Ille-et-Vilaine Silver 68 \n6 Ille-et-Vilaine Gold NaN \n7 Morbihan Bronze 37 \n8 Morbihan Silver 41 \n9 <NA> Platinium 78.7\n\ntoc()\n\n0.151 sec elapsed" + "text": "4.3 Lazy vs eager mode : fight! βš”οΈ\nFor this fight, we’re going to use a fake dataset with 1 000 000 rows and 25 columns created with the {fakir} package. The code for creating this dataset is available at the beginning of this document.\nThis fight will take place over 3 rounds :\n\nWith an eager query versus a lazy query from a DataFrame\nWith an eager query versus a lazy query from a csv file\nWith an eager query versus a lazy query from a parquet file\n\n\n4.3.1 From a DataFrame\nFor this first round and as seen above, let’s start with a simple query from a DataFrame:\n\ntic()\n#| label: fight-eager_dataframe\npl$DataFrame(fake_data)$select(\n pl$col(c(\"region\",\"departement\",\"priority\")) \n )$\n filter(\n pl$col(\"region\") == \"Aquitaine\")\n\nshape: (6_233, 3)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ region ┆ departement ┆ priority β”‚\nβ”‚ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ str ┆ cat β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════════β•ͺ══════════║\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ”‚ … ┆ … ┆ … β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Bronze β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Bronze β”‚\nβ”‚ Aquitaine ┆ null ┆ Bronze β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Bronze β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.101 sec elapsed\n\n\nAs seen above, we’re going to use the lazy() method to convert a DataFrame to a LazyFrame:\n\ntic()\n#| label: fight-lazy_lazyframe\npl$DataFrame(fake_data)$lazy()$\n select(\n pl$col(c(\"region\",\"departement\",\"priority\")) \n )$\n filter(\n pl$col(\"region\") == \"Aquitaine\")$\n collect() # don't forget collect() here!\n\nshape: (6_233, 3)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ region ┆ departement ┆ priority β”‚\nβ”‚ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ str ┆ cat β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════════β•ͺ══════════║\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ”‚ Aquitaine ┆ Dordogne ┆ Silver β”‚\nβ”‚ … ┆ … ┆ … β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Bronze β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Bronze β”‚\nβ”‚ Aquitaine ┆ null ┆ Bronze β”‚\nβ”‚ Aquitaine ┆ Lot-et-Garonne ┆ Bronze β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.096 sec elapsed\n\n\n\n\n4.3.2 From a csv file\nNow, the eager mode is 
represented here by the read_csv() method…\n\ntic()\n#| label: fight-eager_read_csv\npl$read_csv(\"Datasets/fakir_file.csv\", infer_schema_length=0)$ \n select(\n pl$col(c(\"region\",\"departement\",\"priority\",\"age\")))$\n with_columns(\n pl$col(\"age\")$cast(pl$Int32,strict = FALSE))$\n filter(\n pl$col(\"region\") == \"Bretagne\")$\n groupby(\"departement\",\"priority\")$\n agg(pl$col(\"age\")$mean())\n\nshape: (9, 3)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ departement ┆ priority ┆ age β”‚\nβ”‚ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ str ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═══════════β•ͺ═══════════║\nβ”‚ FinistΓ¨re ┆ Gold ┆ 28.0 β”‚\nβ”‚ Ille-et-Vilaine ┆ Platinium ┆ null β”‚\nβ”‚ Ille-et-Vilaine ┆ Silver ┆ 36.002427 β”‚\nβ”‚ CΓ΄tes-d'Armor ┆ Bronze ┆ 18.0 β”‚\nβ”‚ CΓ΄tes-d'Armor ┆ Silver ┆ 32.0 β”‚\nβ”‚ Morbihan ┆ Bronze ┆ 29.301381 β”‚\nβ”‚ CΓ΄tes-d'Armor ┆ Gold ┆ 54.0 β”‚\nβ”‚ FinistΓ¨re ┆ Bronze ┆ 18.0 β”‚\nβ”‚ NA ┆ Gold ┆ 67.0 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.127 sec elapsed\n\n\n… while the lazy method is represented by the pl$scan_csv():\n\ntic()\npl$scan_csv(\"Datasets/fakir_file.csv\", infer_schema_length=0)$\n select(\n pl$col(c(\"region\",\"departement\",\"priority\",\"age\")))$\n with_columns(\n pl$col(\"age\")$cast(pl$Int32,strict = FALSE))$\n filter(\n pl$col(\"region\") == \"Bretagne\")$\n groupby(\"departement\",\"priority\")$\n agg(pl$col(\"age\")$mean())$\n collect()\n\nshape: (9, 3)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ departement ┆ priority ┆ age β”‚\nβ”‚ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ str ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═══════════β•ͺ═══════════║\nβ”‚ Ille-et-Vilaine ┆ Silver ┆ 36.002427 β”‚\nβ”‚ FinistΓ¨re ┆ Bronze ┆ 18.0 β”‚\nβ”‚ CΓ΄tes-d'Armor ┆ Silver ┆ 32.0 β”‚\nβ”‚ CΓ΄tes-d'Armor ┆ Bronze ┆ 18.0 β”‚\nβ”‚ Morbihan ┆ Bronze ┆ 29.301381 β”‚\nβ”‚ Ille-et-Vilaine ┆ Platinium ┆ null β”‚\nβ”‚ FinistΓ¨re ┆ Gold ┆ 28.0 β”‚\nβ”‚ CΓ΄tes-d'Armor ┆ Gold ┆ 54.0 β”‚\nβ”‚ NA ┆ Gold ┆ 67.0 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.047 sec elapsed\n\n\nWe can clearly see that we save a lot of time when executing the lazy version of the code!\n\n\n4.3.3 From a parquet file\nThe read_parquet() method has not been implemented in the R Polars package, but for this fight we will use arrow::read_parquet() and {dplyr} syntax, which will compete with pl$scan_parquet().\n\ntic()\narrow::read_parquet(\"Datasets/fakir_file.parquet\", as_data_frame = FALSE) |>\n filter(region == \"Bretagne\") |> \n group_by(departement,priority) |> \n summarise(mymean=mean(age, na.rm = TRUE)) |> \n arrange(departement) |>\n collect()\n\n# A tibble: 9 Γ— 3\n# Groups: departement [5]\n departement priority mymean\n <chr> <fct> <dbl>\n1 CΓ΄tes-d'Armor Silver 32 \n2 CΓ΄tes-d'Armor Gold 54 \n3 CΓ΄tes-d'Armor Bronze 18 \n4 FinistΓ¨re Gold 28 \n5 FinistΓ¨re Bronze 18 \n6 Ille-et-Vilaine Silver 36.0\n7 Ille-et-Vilaine Platinium NaN \n8 Morbihan Bronze 29.3\n9 <NA> Gold 67 \n\ntoc()\n\n0.283 sec elapsed\n\n\n\ntic()\npl$scan_parquet(\"Datasets/fakir_file.parquet\")$\n filter( \n pl$col(\"region\") == \"Bretagne\")$\n 
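  # Nothing has been executed at this point: this is a LazyFrame, so the
  # filter above is pushed down into the parquet scan (predicate pushdown)
  # and only the columns actually used are read (projection pushdown)
  # when collect() is called at the end of the chain.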
groupby(c(\"departement\",\"priority\"))$\n agg(\n pl$col(c(\"age\"))$mean()\n)$sort(\"departement\")$\n collect()\n\nshape: (9, 3)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ departement ┆ priority ┆ age β”‚\nβ”‚ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ cat ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═══════════β•ͺ═══════════║\nβ”‚ null ┆ Gold ┆ 67.0 β”‚\nβ”‚ CΓ΄tes-d'Armor ┆ Silver ┆ 32.0 β”‚\nβ”‚ CΓ΄tes-d'Armor ┆ Gold ┆ 54.0 β”‚\nβ”‚ CΓ΄tes-d'Armor ┆ Bronze ┆ 18.0 β”‚\nβ”‚ FinistΓ¨re ┆ Gold ┆ 28.0 β”‚\nβ”‚ FinistΓ¨re ┆ Bronze ┆ 18.0 β”‚\nβ”‚ Ille-et-Vilaine ┆ Silver ┆ 36.002427 β”‚\nβ”‚ Ille-et-Vilaine ┆ Platinium ┆ null β”‚\nβ”‚ Morbihan ┆ Bronze ┆ 29.301381 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.016 sec elapsed\n\n\nAnd it’s another victory for the lazy execution!\n\n\n\n\n\n\nImportant\n\n\n\nNote that the {arrow} package also have ability to scan parquet files in a lazy way with the arrow::open_dataset function.\n\ntic()\narrow::open_dataset(\"Datasets/fakir_file.parquet\") |> \n filter(region == \"Bretagne\") |>\n group_by(departement,priority) |> \n summarise(mymean=mean(age, na.rm = TRUE)) |>\n arrange(departement) |>\n collect()\n\n# A tibble: 9 Γ— 3\n# Groups: departement [5]\n departement priority mymean\n <chr> <fct> <dbl>\n1 CΓ΄tes-d'Armor Silver 32 \n2 CΓ΄tes-d'Armor Gold 54 \n3 CΓ΄tes-d'Armor Bronze 18 \n4 FinistΓ¨re Gold 28 \n5 FinistΓ¨re Bronze 18 \n6 Ille-et-Vilaine Silver 36.0\n7 Ille-et-Vilaine Platinium NaN \n8 Morbihan Bronze 29.3\n9 <NA> Gold 67 \n\ntoc()\n\n0.122 sec elapsed" }, { "objectID": "benchmarking.html#from-an-r-object", "href": "benchmarking.html#from-an-r-object", "title": "5Β  Benchmarking", "section": "5.1 From an R object", - "text": "5.1 From an R object\nThis section analyses the different methods for making a query from an R object already loaded in memory.\nLet’s start by comparing polars with R base, dplyr and data.table.\n\npolarsR basedplyrdata.table\n\n\n\nrobject_polars <- function() {\n \n DataMultiTypes_pl$\n # Filter rows\n filter(\n pl$col(\"colInt\")>2000 & pl$col(\"colInt\")<8000\n )$\n # Grouping and aggregation\n groupby(\n \"colString\")$\n agg(\n pl$col(\"colInt\")$min()$alias(\"min_colInt\"),\n pl$col(\"colInt\")$mean()$alias(\"mean_colInt\"),\n pl$col(\"colInt\")$max()$alias(\"max_colInt\"),\n pl$col(\"colNum\")$min()$alias(\"min_colNum\"),\n pl$col(\"colNum\")$mean()$alias(\"mean_colNum\"),\n pl$col(\"colNum\")$max()$alias(\"max_colNum\")\n )\n}\n\n\n\n\nrobject_rbase <- function() {\n \n # Grouping and aggregation from data filtered\n aggregate(cbind(colInt, colNum) ~ colString, \n data = DataMultiTypes[DataMultiTypes$colInt>2000 & DataMultiTypes$colInt<8000,], \n FUN = function(x) c(mean = mean(x), \n min = min(x), \n max = max(x)))\n \n}\n\n\n\n\nrobject_dplyr <- function() {\n \n DataMultiTypes |>\n \n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n \n # Grouping and aggregation\n group_by(colString) |> \n \n summarise(\n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n )\n\n}\n\n\n\n\nrobject_dt <- function() {\n \n as.data.table(DataMultiTypes)[\n \n colInt > 2000 & colInt < 8000\n \n ][, .(min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n 
min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)),\n \n by = colString\n ]\n}\n\n\n\n\nNow let’s look at how to use the DuckDb engine on R objects.\nThere are 3 main possibilities:\n\nTo use the DuckDB engine to query a R object with dplyr, you can use the duckdb::duckdb_register() method and then the dplyr::tbl() method to pass your dplyr instructions (dplyr/DuckDB).\nTo use the DuckDB engine to query a R object with the standard DBI methods, you can use the duckdb::duckdb_register() method and then the DBI::dbGetQuery() method to pass your SQL query (SQL/DuckDB).\nTo use the DuckDB engine to query a R object in combination with {arrow} package, you can use the arrow::to_duckdb() and then pass your dplyr instructions (dplyr/arrow/DuckDB).\n\n\ndplyr/DuckDBSQL/DuckDBdplyr/arrow/DuckDB\n\n\n\nrobject_duckdb_dplyr <- function(variables) {\n \n con <- DBI::dbConnect(duckdb::duckdb())\n\n duckdb::duckdb_register(con, \"DataMultiTypes\", DataMultiTypes)\n\n tbl(con, \"DataMultiTypes\") |>\n \n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n # Grouping and aggregation\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt, na.rm = TRUE),\n mean_colInt = mean(colInt, na.rm = TRUE),\n mas_colInt = max(colInt, na.rm = TRUE),\n min_colNum = min(colNum, na.rm = TRUE),\n mean_colNum = mean(colNum, na.rm = TRUE),\n max_colNum = max(colNum, na.rm = TRUE)\n ) |>\n collect()\n \n DBI::dbDisconnect(con, shutdown=TRUE)\n \n}\n\n\n\n\nrobject_duckdb_sql <- function(variables) {\n \n con <- DBI::dbConnect(duckdb::duckdb())\n\n duckdb::duckdb_register(con, \"DataMultiTypes\", DataMultiTypes)\n\n DBI::dbGetQuery(\n con, \n \"SELECT colString,\n MIN(colInt) AS min_colInt,\n AVG(colInt) AS mean_colInt,\n MAX(colInt) AS max_colInt,\n MIN(colNum) AS min_colNum,\n AVG(colNum) AS mean_colNum,\n MAX(colNum) AS max_colNum\n FROM (\n SELECT colString,\n colInt,\n colNum\n FROM DataMultiTypes\n WHERE colInt > 2000 AND colInt < 8000\n) AS filtered_data\nGROUP BY colString;\")\n \n DBI::dbDisconnect(con, shutdown=TRUE)\n \n}\n\n\n\n\nrobject_duckdb_arrow_dplyr <- function(variables) {\n \n DataMultiTypes |>\n \n to_duckdb() |>\n \n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n # Grouping and aggregation\n group_by(colString) |> \n \n summarise(\n min_colInt = min(colInt, na.rm = TRUE),\n mean_colInt = mean(colInt, na.rm = TRUE),\n mas_colInt = max(colInt, na.rm = TRUE),\n min_colNum = min(colNum, na.rm = TRUE),\n mean_colNum = mean(colNum, na.rm = TRUE),\n max_colNum = max(colNum, na.rm = TRUE)\n ) \n \n}\n\n\n\n\n\n\n\n\n\n\nTip\n\n\n\nOne of the advantages of using the DuckDB engine and dplyr may be to use a feature implemented by DuckDB but not yet by Arrow. We can do the opposite, and return to the Arrow engine with arrow::to_arrow().\nHowever, the benchmark results are clear: SQL queries are by far the fastest! 
πŸ†\n\n\n\n5.1.1 Results with a R object\n\nmicrobenchmark(\n robject_polars(),\n robject_rbase(),\n robject_dplyr(),\n robject_dt(),\n robject_duckdb_dplyr(),\n robject_duckdb_sql(),\n robject_duckdb_arrow_dplyr(),\n times = 5\n ) \n\nUnit: milliseconds\n expr min lq mean median uq\n robject_polars() 27.1281 30.1365 35.44288 32.5842 38.7489\n robject_rbase() 216.0771 228.6539 237.63752 239.9291 248.2713\n robject_dplyr() 32.2709 34.4209 46.88596 49.5437 58.3165\n robject_dt() 45.0771 46.3848 59.67866 57.2030 68.3662\n robject_duckdb_dplyr() 376.9727 397.6940 403.66032 400.4190 411.9787\n robject_duckdb_sql() 80.6400 86.6684 89.71994 90.3671 91.0371\n robject_duckdb_arrow_dplyr() 292.8251 305.6941 333.98948 319.2016 321.3135\n max neval\n 48.6167 5\n 255.2562 5\n 59.8778 5\n 81.3622 5\n 431.2372 5\n 99.8871 5\n 430.9131 5\n\n\nπŸ‘‰ Conclusion of this little benchmark using R objects already loaded in memory: the fastest to run are polars and dplyr followed closely by data.table. πŸ†πŸ†πŸ†\nThe worst performer is surprisingly duckdb with the dplyr syntax, while duckdb with the SQL language does very well and comes 4th in this ranking." + "text": "5.1 From an R object\nThis section analyses the different methods for making a query from an R object already loaded in memory.\nLet’s start by comparing polars with R base, dplyr and data.table.\n\npolarsR basedplyrdata.table\n\n\n\nrobject_polars <- function() {\n \n DataMultiTypes_pl$\n # Filter rows\n filter(\n pl$col(\"colInt\")>2000 & pl$col(\"colInt\")<8000\n )$\n # Grouping and aggregation\n groupby(\n \"colString\")$\n agg(\n pl$col(\"colInt\")$min()$alias(\"min_colInt\"),\n pl$col(\"colInt\")$mean()$alias(\"mean_colInt\"),\n pl$col(\"colInt\")$max()$alias(\"max_colInt\"),\n pl$col(\"colNum\")$min()$alias(\"min_colNum\"),\n pl$col(\"colNum\")$mean()$alias(\"mean_colNum\"),\n pl$col(\"colNum\")$max()$alias(\"max_colNum\")\n )\n}\n\n\n\n\nrobject_rbase <- function() {\n \n # Grouping and aggregation from data filtered\n aggregate(cbind(colInt, colNum) ~ colString, \n data = DataMultiTypes[DataMultiTypes$colInt>2000 & DataMultiTypes$colInt<8000,], \n FUN = function(x) c(mean = mean(x), \n min = min(x), \n max = max(x)))\n \n}\n\n\n\n\nrobject_dplyr <- function() {\n \n DataMultiTypes |>\n \n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n \n # Grouping and aggregation\n group_by(colString) |> \n \n summarise(\n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n )\n\n}\n\n\n\n\nrobject_dt <- function() {\n \n as.data.table(DataMultiTypes)[\n \n colInt > 2000 & colInt < 8000\n \n ][, .(min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)),\n \n by = colString\n ]\n}\n\n\n\n\nNow let’s look at how to use the DuckDb engine on R objects.\nThere are 3 main possibilities:\n\nTo use the DuckDB engine to query a R object with dplyr, you can use the duckdb::duckdb_register() method and then the dplyr::tbl() method to pass your dplyr instructions (dplyr/DuckDB).\nTo use the DuckDB engine to query a R object with the standard DBI methods, you can use the duckdb::duckdb_register() method and then the DBI::dbGetQuery() method to pass your SQL query (SQL/DuckDB).\nTo use the DuckDB engine to query a R object in combination with {arrow} package, you can use the arrow::to_duckdb() and then pass your dplyr 
instructions (dplyr/arrow/DuckDB).\n\n\ndplyr/DuckDBSQL/DuckDBdplyr/arrow/DuckDB\n\n\n\nrobject_duckdb_dplyr <- function(variables) {\n \n con <- DBI::dbConnect(duckdb::duckdb())\n\n duckdb::duckdb_register(con, \"DataMultiTypes\", DataMultiTypes)\n\n tbl(con, \"DataMultiTypes\") |>\n \n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n # Grouping and aggregation\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt, na.rm = TRUE),\n mean_colInt = mean(colInt, na.rm = TRUE),\n mas_colInt = max(colInt, na.rm = TRUE),\n min_colNum = min(colNum, na.rm = TRUE),\n mean_colNum = mean(colNum, na.rm = TRUE),\n max_colNum = max(colNum, na.rm = TRUE)\n ) |>\n collect()\n \n DBI::dbDisconnect(con, shutdown=TRUE)\n \n}\n\n\n\n\nrobject_duckdb_sql <- function(variables) {\n \n con <- DBI::dbConnect(duckdb::duckdb())\n\n duckdb::duckdb_register(con, \"DataMultiTypes\", DataMultiTypes)\n\n DBI::dbGetQuery(\n con, \n \"SELECT colString,\n MIN(colInt) AS min_colInt,\n AVG(colInt) AS mean_colInt,\n MAX(colInt) AS max_colInt,\n MIN(colNum) AS min_colNum,\n AVG(colNum) AS mean_colNum,\n MAX(colNum) AS max_colNum\n FROM (\n SELECT colString,\n colInt,\n colNum\n FROM DataMultiTypes\n WHERE colInt > 2000 AND colInt < 8000\n) AS filtered_data\nGROUP BY colString;\")\n \n DBI::dbDisconnect(con, shutdown=TRUE)\n \n}\n\n\n\n\nrobject_duckdb_arrow_dplyr <- function(variables) {\n \n DataMultiTypes |>\n \n to_duckdb() |>\n \n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n # Grouping and aggregation\n group_by(colString) |> \n \n summarise(\n min_colInt = min(colInt, na.rm = TRUE),\n mean_colInt = mean(colInt, na.rm = TRUE),\n mas_colInt = max(colInt, na.rm = TRUE),\n min_colNum = min(colNum, na.rm = TRUE),\n mean_colNum = mean(colNum, na.rm = TRUE),\n max_colNum = max(colNum, na.rm = TRUE)\n ) \n \n}\n\n\n\n\n\n\n\n\n\n\nTip\n\n\n\nOne of the advantages of using the DuckDB engine and dplyr may be to use a feature implemented by DuckDB but not yet by Arrow. We can do the opposite, and return to the Arrow engine with arrow::to_arrow().\nHowever, the benchmark results are clear: SQL queries are by far the fastest! πŸ†\n\n\n\n5.1.1 Results with a R object\n\nmicrobenchmark(\n robject_polars(),\n robject_rbase(),\n robject_dplyr(),\n robject_dt(),\n robject_duckdb_dplyr(),\n robject_duckdb_sql(),\n robject_duckdb_arrow_dplyr(),\n times = 5\n ) \n\nUnit: milliseconds\n expr min lq mean median uq\n robject_polars() 22.2125 24.2582 28.78548 24.4624 36.2487\n robject_rbase() 220.6788 246.8106 247.88984 247.2107 257.8647\n robject_dplyr() 31.4568 32.3749 43.88756 45.0114 54.9415\n robject_dt() 44.0361 46.0946 57.57272 54.2305 67.5367\n robject_duckdb_dplyr() 321.1113 322.1483 332.46270 326.6855 344.4590\n robject_duckdb_sql() 75.9102 78.3098 83.63402 82.7341 89.0440\n robject_duckdb_arrow_dplyr() 245.3665 248.7998 269.75346 254.2880 255.9525\n max neval\n 36.7456 5\n 266.8844 5\n 55.6532 5\n 75.9657 5\n 347.9094 5\n 92.1720 5\n 344.3605 5\n\n\nπŸ‘‰ Conclusion of this little benchmark using R objects already loaded in memory: the fastest to run are polars and dplyr followed closely by data.table. πŸ†πŸ†πŸ†\nThe worst performer is surprisingly duckdb with the dplyr syntax, while duckdb with the SQL language does very well and comes 4th in this ranking." 
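To illustrate the tip above with code, here is a minimal sketch (not part of the benchmark, and assuming {dplyr} and {arrow} are loaded as in the functions above) of starting a query on the DuckDB engine and then handing the intermediate result back to the Arrow (Acero) engine with arrow::to_arrow():

# Minimal sketch: DuckDB engine for the filter, Arrow (Acero) engine for the aggregation
DataMultiTypes |>
  arrow::to_duckdb() |>                    # the query runs in DuckDB from here...
  filter(colInt > 2000 & colInt < 8000) |>
  arrow::to_arrow() |>                     # ...and back on the Arrow engine from here
  group_by(colString) |>
  summarise(mean_colNum = mean(colNum, na.rm = TRUE)) |>
  collect()

This kind of round trip can be handy when one step of a query needs a feature that only one of the two engines implements, as mentioned in the tip above.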
}, { "objectID": "benchmarking.html#from-a-csv-file", "href": "benchmarking.html#from-a-csv-file", "title": "5Β  Benchmarking", "section": "5.2 From a csv file", - "text": "5.2 From a csv file\nFor this comparison, we will use :\n\nFor polars (eager), the pl$read_csv() method\nFor polars (lazy), the pl$scan_csv() method\nFor R base, the read.csv() method\nFor dplyr, the readr::read_csv() method\nFor data.table, the data.table::fread() method\n\n\npolars (eager)polars (lazy)R basedplyrdplyr (Acero)data.table\n\n\n\ncsv_eager_polars <- function() {\n# Reading the csv file (eager mode)\nresult_agg <- pl$read_csv(path = \"Datasets/DataMultiTypes.csv\")$\n # Conversion of 2 columns to Date format\n with_columns(\n pl$col(\"colDate1\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE),\n pl$col(\"colDate2\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE)\n )$\n # Creation of a diff column between 2 dates (in days)\n with_columns(\n (pl$col(\"colDate2\") - pl$col(\"colDate1\"))$dt$days()$alias(\"diff\")\n )$\n # Filter rows\n filter(\n pl$col(\"colInt\")>2000 & pl$col(\"colInt\")<8000\n )$\n # Grouping and aggregation\n groupby(\n \"colString\")$\n agg(\n pl$col(\"colInt\")$min()$alias(\"min_colInt\"),\n pl$col(\"colInt\")$mean()$alias(\"mean_colInt\"),\n pl$col(\"colInt\")$max()$alias(\"max_colInt\"),\n pl$col(\"colNum\")$min()$alias(\"min_colNum\"),\n pl$col(\"colNum\")$mean()$alias(\"mean_colNum\"),\n pl$col(\"colNum\")$max()$alias(\"max_colNum\")\n )\n \n return(result_agg)\n}\ntic()\ncsv_eager_polars()\n\nshape: (3, 7)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ colString ┆ min_colInt ┆ mean_colInt ┆ max_colInt ┆ min_colNum ┆ mean_colNum ┆ max_colNum β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ i64 ┆ f64 ┆ i64 ┆ f64 ┆ f64 ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════β•ͺ═════════════β•ͺ════════════β•ͺ════════════β•ͺ═════════════β•ͺ════════════║\nβ”‚ C ┆ 2001 ┆ 5001.243285 ┆ 7999 ┆ 0.00003 ┆ 0.501472 ┆ 0.999992 β”‚\nβ”‚ A ┆ 2001 ┆ 4998.624945 ┆ 7999 ┆ 0.000038 ┆ 0.498445 ┆ 0.999988 β”‚\nβ”‚ B ┆ 2001 ┆ 5004.31148 ┆ 7999 ┆ 0.000034 ┆ 0.500546 ┆ 0.999986 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.29 sec elapsed\n\n\n\n\n\ncsv_lazy_polars <- function() {\n# Reading the csv file (eager mode)\nresult_agg <- pl$scan_csv(path = \"Datasets/DataMultiTypes.csv\")$\n # Conversion of 2 columns to Date format\n with_columns(\n pl$col(\"colDate1\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE),\n pl$col(\"colDate2\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE)\n )$\n # Creation of a diff column between 2 dates (in days)\n with_columns(\n (pl$col(\"colDate2\") - pl$col(\"colDate1\"))$dt$days()$alias(\"diff\")\n )$\n # Filter rows\n filter(\n pl$col(\"colInt\")>2000 & pl$col(\"colInt\")<8000\n )$\n # Grouping and aggregation\n groupby(\n \"colString\")$\n agg(\n pl$col(\"colInt\")$min()$alias(\"min_colInt\"),\n pl$col(\"colInt\")$mean()$alias(\"mean_colInt\"),\n pl$col(\"colInt\")$max()$alias(\"max_colInt\"),\n pl$col(\"colNum\")$min()$alias(\"min_colNum\"),\n 
pl$col(\"colNum\")$mean()$alias(\"mean_colNum\"),\n pl$col(\"colNum\")$max()$alias(\"max_colNum\")\n )\n \n return(result_agg)\n}\ntic()\ncsv_lazy_polars()$collect()\n\nshape: (3, 7)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ colString ┆ min_colInt ┆ mean_colInt ┆ max_colInt ┆ min_colNum ┆ mean_colNum ┆ max_colNum β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ i64 ┆ f64 ┆ i64 ┆ f64 ┆ f64 ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════β•ͺ═════════════β•ͺ════════════β•ͺ════════════β•ͺ═════════════β•ͺ════════════║\nβ”‚ C ┆ 2001 ┆ 5001.243285 ┆ 7999 ┆ 0.00003 ┆ 0.501472 ┆ 0.999992 β”‚\nβ”‚ B ┆ 2001 ┆ 5004.31148 ┆ 7999 ┆ 0.000034 ┆ 0.500546 ┆ 0.999986 β”‚\nβ”‚ A ┆ 2001 ┆ 4998.624945 ┆ 7999 ┆ 0.000038 ┆ 0.498445 ┆ 0.999988 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.098 sec elapsed\n\n\n\n\n\ncsv_rbase <- function() {\n \n # Reading the csv file\n result <- read.csv(\"Datasets/DataMultiTypes.csv\")\n \n # Conversion of 2 columns to Date format\n result$colDate1 <- as.Date(result$colDate1)\n result$colDate2 <- as.Date(result$colDate2)\n \n # Creation of a diff column between 2 dates (in days)\n result$diff <- round(\n as.integer(\n difftime(\n result$colDate2,\n result$colDate1,\n units = \"days\")\n ),\n 0)\n \n # Filter rows\n result <- result[result$colInt>2000 & result$colInt<8000,]\n \n # Grouping and aggregation\n result_agg <- aggregate(cbind(colInt, colNum) ~ colString, \n data = result, \n FUN = function(x) c(mean = mean(x), \n min = min(x), \n max = max(x)))\n \n return(result_agg)\n}\n\ntic()\nres_rbase <- csv_rbase()\ntoc()\n\n10.386 sec elapsed\n\nprint(res_rbase)\n\n colString colInt.mean colInt.min colInt.max colNum.mean colNum.min\n1 A 4998.625 2001.000 7999.000 4.984446e-01 3.794138e-05\n2 B 5004.311 2001.000 7999.000 5.005457e-01 3.385660e-05\n3 C 5001.243 2001.000 7999.000 5.014723e-01 3.045052e-05\n colNum.max\n1 9.999879e-01\n2 9.999863e-01\n3 9.999921e-01\n\n\n\n\n\ncsv_dplyr <- function() {\n \n # Reading the csv file\n result <- readr::read_csv(\"Datasets/DataMultiTypes.csv\", show_col_types = FALSE)\n \n # Conversion of 2 columns to Date format\n result <- result |>\n mutate(\n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2)\n )\n \n # Creation of a diff column between 2 dates (in days)\n result <- result |> \n mutate(diff = round(as.integer(difftime(colDate2, colDate1, units = \"days\")),0))\n \n # Filter rows\n result <- result |>\n filter(\n colInt>2000 & colInt<8000\n )\n \n # Grouping and aggregation\n result_agg <- result |>\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n )\n \n return(result_agg)\n}\n\ntic()\nres_dplyr <- csv_dplyr()\ntoc()\n\n0.787 sec elapsed\n\nprint(res_dplyr)\n\n# A tibble: 3 Γ— 7\n colString min_colInt mean_colInt mas_colInt min_colNum mean_colNum max_colNum\n <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>\n1 A 2001 4999. 
7999 0.0000379 0.498 1.00\n2 B 2001 5004. 7999 0.0000339 0.501 1.00\n3 C 2001 5001. 7999 0.0000305 0.501 1.00\n\n\n\n\n\ncsv_arrow <- function() {\n \n # Reading the csv file\n result <- arrow::read_csv_arrow(\"Datasets/DataMultiTypes.csv\", as_data_frame = FALSE)\n \n # Conversion of 2 columns to Date format\n result <- result |>\n mutate(\n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2)\n )\n \n # Creation of a diff column between 2 dates (in days)\n result <- result |>\n # difftime(unit = \"days\") is not supported in arrow yet\n mutate(diff = round(as.integer64(difftime(colDate2, colDate1)) %/% (60 * 60 * 24), 0))\n \n # Filter rows\n result <- result |>\n filter(\n colInt>2000 & colInt<8000\n )\n \n # Grouping and aggregation\n result_agg <- result |>\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n ) |>\n collect()\n\n return(result_agg)\n}\n\ntic()\nres_arrow <- csv_arrow()\ntoc()\n\n0.419 sec elapsed\n\nprint(res_arrow)\n\n# A tibble: 3 Γ— 7\n colString min_colInt mean_colInt mas_colInt min_colNum mean_colNum max_colNum\n <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>\n1 A 2001 4999. 7999 0.0000379 0.498 1.00\n2 C 2001 5001. 7999 0.0000305 0.501 1.00\n3 B 2001 5004. 7999 0.0000339 0.501 1.00\n\n\n\n\n\ncsv_dt <- function() {\n \n result_agg <- as.data.table(data.table::fread(\"Datasets/DataMultiTypes.csv\"))[, `:=`(\n \n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2),\n diff = as.integer(difftime(colDate2, colDate1, units = \"days\"))\n \n)][colInt > 2000 & colInt < 8000, .(\n \n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n max_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n \n), by = colString]\n \n return(result_agg)\n}\ntic()\ncsv_dt()\n\n colString min_colInt mean_colInt max_colInt min_colNum mean_colNum\n1: B 2001 5004.311 7999 3.385660e-05 0.5005457\n2: C 2001 5001.243 7999 3.045052e-05 0.5014723\n3: A 2001 4998.625 7999 3.794138e-05 0.4984446\n max_colNum\n1: 0.9999863\n2: 0.9999921\n3: 0.9999879\n\ntoc()\n\n0.389 sec elapsed\n\n\n\n\n\n\n\n\n\n\n\nNote\n\n\n\nThe data processing performed is not entirely equivalent, since it includes in addition:\n- for polars (lazy mode), conversion to data.frame R at the end of processing\n- for data.table, conversion to dt format at the start, then conversion to data.frame R at the end of processing\n\n\n\n5.2.1 Results eager vs lazy mode\n\ncsv_bmk <- microbenchmark(\n \"polars (eager) from csv file\" = csv_eager_polars(),\n \"polars (lazy) from csv file\" = csv_lazy_polars()$collect(),\n \"R base - from csv file\" = csv_rbase(),\n \"dplyr - from csv file\" = csv_dplyr(),\n \"dplyr (Acero) - from csv file\" = csv_arrow(),\n \"data.table - from csv file\" = csv_dt(),\n times = 5\n )\ncsv_bmk\n\nUnit: milliseconds\n expr min lq mean median\n polars (eager) from csv file 279.7458 291.2039 300.19328 298.5861\n polars (lazy) from csv file 82.5635 84.0201 89.73178 85.1958\n R base - from csv file 8758.9725 8811.8265 9101.71930 8954.2198\n dplyr - from csv file 607.1868 610.0531 656.87030 645.0672\n dplyr (Acero) - from csv file 244.1663 247.5951 256.30072 253.8460\n data.table - from csv file 259.7881 263.5397 383.01002 335.6116\n uq max neval\n 308.3921 323.0385 5\n 88.3450 108.5345 5\n 9076.5466 9907.0311 5\n 707.6205 714.4239 5\n 266.2006 269.6956 5\n 431.9201 624.1906 5\n\n\nπŸ‘‰ 
Conclusion of this little benchmark based on csv files: the big winners are polars (eager mode) and dplyr with {arrow}. The results will undoubtedly be even better with polars (lazy mode)… πŸ†πŸ†πŸ†\nTO DO !!!" + "text": "5.2 From a csv file\nFor this comparison, we will use :\n\nFor polars (eager), the pl$read_csv() method\nFor polars (lazy), the pl$scan_csv() method\nFor R base, the read.csv() method\nFor dplyr, the readr::read_csv() method\nFor data.table, the data.table::fread() method\n\n\npolars (eager)polars (lazy)R basedplyrdplyr (Acero)data.table\n\n\n\ncsv_eager_polars <- function() {\n# Reading the csv file (eager mode)\nresult_agg <- pl$read_csv(path = \"Datasets/DataMultiTypes.csv\")$\n # Conversion of 2 columns to Date format\n with_columns(\n pl$col(\"colDate1\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE),\n pl$col(\"colDate2\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE)\n )$\n # Creation of a diff column between 2 dates (in days)\n with_columns(\n (pl$col(\"colDate2\") - pl$col(\"colDate1\"))$dt$days()$alias(\"diff\")\n )$\n # Filter rows\n filter(\n pl$col(\"colInt\")>2000 & pl$col(\"colInt\")<8000\n )$\n # Grouping and aggregation\n groupby(\n \"colString\")$\n agg(\n pl$col(\"colInt\")$min()$alias(\"min_colInt\"),\n pl$col(\"colInt\")$mean()$alias(\"mean_colInt\"),\n pl$col(\"colInt\")$max()$alias(\"max_colInt\"),\n pl$col(\"colNum\")$min()$alias(\"min_colNum\"),\n pl$col(\"colNum\")$mean()$alias(\"mean_colNum\"),\n pl$col(\"colNum\")$max()$alias(\"max_colNum\")\n )\n \n return(result_agg)\n}\ntic()\ncsv_eager_polars()\n\nshape: (3, 7)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ colString ┆ min_colInt ┆ mean_colInt ┆ max_colInt ┆ min_colNum ┆ mean_colNum ┆ max_colNum β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ i64 ┆ f64 ┆ i64 ┆ f64 ┆ f64 ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════β•ͺ═════════════β•ͺ════════════β•ͺ════════════β•ͺ═════════════β•ͺ════════════║\nβ”‚ B ┆ 2001 ┆ 5004.31148 ┆ 7999 ┆ 0.000034 ┆ 0.500546 ┆ 0.999986 β”‚\nβ”‚ A ┆ 2001 ┆ 4998.624945 ┆ 7999 ┆ 0.000038 ┆ 0.498445 ┆ 0.999988 β”‚\nβ”‚ C ┆ 2001 ┆ 5001.243285 ┆ 7999 ┆ 0.00003 ┆ 0.501472 ┆ 0.999992 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.263 sec elapsed\n\n\n\n\n\ncsv_lazy_polars <- function() {\n# Reading the csv file (eager mode)\nresult_agg <- pl$scan_csv(path = \"Datasets/DataMultiTypes.csv\")$\n # Conversion of 2 columns to Date format\n with_columns(\n pl$col(\"colDate1\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE),\n pl$col(\"colDate2\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE)\n )$\n # Creation of a diff column between 2 dates (in days)\n with_columns(\n (pl$col(\"colDate2\") - pl$col(\"colDate1\"))$dt$days()$alias(\"diff\")\n )$\n # Filter rows\n filter(\n pl$col(\"colInt\")>2000 & pl$col(\"colInt\")<8000\n )$\n # Grouping and aggregation\n groupby(\n \"colString\")$\n agg(\n pl$col(\"colInt\")$min()$alias(\"min_colInt\"),\n pl$col(\"colInt\")$mean()$alias(\"mean_colInt\"),\n pl$col(\"colInt\")$max()$alias(\"max_colInt\"),\n 
pl$col(\"colNum\")$min()$alias(\"min_colNum\"),\n pl$col(\"colNum\")$mean()$alias(\"mean_colNum\"),\n pl$col(\"colNum\")$max()$alias(\"max_colNum\")\n )\n \n return(result_agg)\n}\ntic()\ncsv_lazy_polars()$collect()\n\nshape: (3, 7)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ colString ┆ min_colInt ┆ mean_colInt ┆ max_colInt ┆ min_colNum ┆ mean_colNum ┆ max_colNum β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ i64 ┆ f64 ┆ i64 ┆ f64 ┆ f64 ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════β•ͺ═════════════β•ͺ════════════β•ͺ════════════β•ͺ═════════════β•ͺ════════════║\nβ”‚ B ┆ 2001 ┆ 5004.31148 ┆ 7999 ┆ 0.000034 ┆ 0.500546 ┆ 0.999986 β”‚\nβ”‚ C ┆ 2001 ┆ 5001.243285 ┆ 7999 ┆ 0.00003 ┆ 0.501472 ┆ 0.999992 β”‚\nβ”‚ A ┆ 2001 ┆ 4998.624945 ┆ 7999 ┆ 0.000038 ┆ 0.498445 ┆ 0.999988 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.084 sec elapsed\n\n\n\n\n\ncsv_rbase <- function() {\n \n # Reading the csv file\n result <- read.csv(\"Datasets/DataMultiTypes.csv\")\n \n # Conversion of 2 columns to Date format\n result$colDate1 <- as.Date(result$colDate1)\n result$colDate2 <- as.Date(result$colDate2)\n \n # Creation of a diff column between 2 dates (in days)\n result$diff <- round(\n as.integer(\n difftime(\n result$colDate2,\n result$colDate1,\n units = \"days\")\n ),\n 0)\n \n # Filter rows\n result <- result[result$colInt>2000 & result$colInt<8000,]\n \n # Grouping and aggregation\n result_agg <- aggregate(cbind(colInt, colNum) ~ colString, \n data = result, \n FUN = function(x) c(mean = mean(x), \n min = min(x), \n max = max(x)))\n \n return(result_agg)\n}\n\ntic()\nres_rbase <- csv_rbase()\ntoc()\n\n9.506 sec elapsed\n\nprint(res_rbase)\n\n colString colInt.mean colInt.min colInt.max colNum.mean colNum.min\n1 A 4998.625 2001.000 7999.000 4.984446e-01 3.794138e-05\n2 B 5004.311 2001.000 7999.000 5.005457e-01 3.385660e-05\n3 C 5001.243 2001.000 7999.000 5.014723e-01 3.045052e-05\n colNum.max\n1 9.999879e-01\n2 9.999863e-01\n3 9.999921e-01\n\n\n\n\n\ncsv_dplyr <- function() {\n \n # Reading the csv file\n result <- readr::read_csv(\"Datasets/DataMultiTypes.csv\", show_col_types = FALSE)\n \n # Conversion of 2 columns to Date format\n result <- result |>\n mutate(\n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2)\n )\n \n # Creation of a diff column between 2 dates (in days)\n result <- result |> \n mutate(diff = round(as.integer(difftime(colDate2, colDate1, units = \"days\")),0))\n \n # Filter rows\n result <- result |>\n filter(\n colInt>2000 & colInt<8000\n )\n \n # Grouping and aggregation\n result_agg <- result |>\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n )\n \n return(result_agg)\n}\n\ntic()\nres_dplyr <- csv_dplyr()\ntoc()\n\n0.677 sec elapsed\n\nprint(res_dplyr)\n\n# A tibble: 3 Γ— 7\n colString min_colInt mean_colInt mas_colInt min_colNum mean_colNum max_colNum\n <chr> <dbl> <dbl> <dbl> <dbl> <dbl> 
<dbl>\n1 A 2001 4999. 7999 0.0000379 0.498 1.00\n2 B 2001 5004. 7999 0.0000339 0.501 1.00\n3 C 2001 5001. 7999 0.0000305 0.501 1.00\n\n\n\n\n\ncsv_arrow <- function() {\n \n # Reading the csv file\n result <- arrow::read_csv_arrow(\"Datasets/DataMultiTypes.csv\", as_data_frame = FALSE)\n \n # Conversion of 2 columns to Date format\n result <- result |>\n mutate(\n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2)\n )\n \n # Creation of a diff column between 2 dates (in days)\n result <- result |>\n # difftime(unit = \"days\") is not supported in arrow yet\n mutate(diff = round(as.integer64(difftime(colDate2, colDate1)) %/% (60 * 60 * 24), 0))\n \n # Filter rows\n result <- result |>\n filter(\n colInt>2000 & colInt<8000\n )\n \n # Grouping and aggregation\n result_agg <- result |>\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n ) |>\n collect()\n\n return(result_agg)\n}\n\ntic()\nres_arrow <- csv_arrow()\ntoc()\n\n0.365 sec elapsed\n\nprint(res_arrow)\n\n# A tibble: 3 Γ— 7\n colString min_colInt mean_colInt mas_colInt min_colNum mean_colNum max_colNum\n <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>\n1 A 2001 4999. 7999 0.0000379 0.498 1.00\n2 C 2001 5001. 7999 0.0000305 0.501 1.00\n3 B 2001 5004. 7999 0.0000339 0.501 1.00\n\n\n\n\n\ncsv_dt <- function() {\n \n result_agg <- as.data.table(data.table::fread(\"Datasets/DataMultiTypes.csv\"))[, `:=`(\n \n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2),\n diff = as.integer(difftime(colDate2, colDate1, units = \"days\"))\n \n)][colInt > 2000 & colInt < 8000, .(\n \n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n max_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n \n), by = colString]\n \n return(result_agg)\n}\ntic()\ncsv_dt()\n\n colString min_colInt mean_colInt max_colInt min_colNum mean_colNum\n1: B 2001 5004.311 7999 3.385660e-05 0.5005457\n2: C 2001 5001.243 7999 3.045052e-05 0.5014723\n3: A 2001 4998.625 7999 3.794138e-05 0.4984446\n max_colNum\n1: 0.9999863\n2: 0.9999921\n3: 0.9999879\n\ntoc()\n\n0.358 sec elapsed\n\n\n\n\n\n\n\n\n\n\n\nNote\n\n\n\nThe data processing performed is not entirely equivalent, since it includes in addition:\n- for polars (lazy mode), conversion to data.frame R at the end of processing\n- for data.table, conversion to dt format at the start, then conversion to data.frame R at the end of processing\n\n\n\n5.2.1 Results eager vs lazy mode\n\ncsv_bmk <- microbenchmark(\n \"polars (eager) from csv file\" = csv_eager_polars(),\n \"polars (lazy) from csv file\" = csv_lazy_polars()$collect(),\n \"R base - from csv file\" = csv_rbase(),\n \"dplyr - from csv file\" = csv_dplyr(),\n \"dplyr (Acero) - from csv file\" = csv_arrow(),\n \"data.table - from csv file\" = csv_dt(),\n times = 5\n )\ncsv_bmk\n\nUnit: milliseconds\n expr min lq mean median\n polars (eager) from csv file 259.0630 260.2271 273.28366 268.1899\n polars (lazy) from csv file 79.8473 83.4839 87.61572 84.9361\n R base - from csv file 7954.4349 8392.2098 8445.26252 8438.6395\n dplyr - from csv file 523.7279 553.9853 594.50394 567.0172\n dplyr (Acero) - from csv file 209.4744 210.2422 216.63242 218.0538\n data.table - from csv file 262.3306 263.1075 351.57228 324.2653\n uq max neval\n 284.1632 294.7751 5\n 86.7033 103.1080 5\n 8601.4964 8839.5320 5\n 656.8072 670.9821 5\n 222.2487 223.1430 5\n 402.7321 
505.4259 5\n\n\nπŸ‘‰ Conclusion of this little benchmark based on csv files: the big winners are polars (eager mode) and dplyr with {arrow}. The results will undoubtedly be even better with polars (lazy mode)… πŸ†πŸ†πŸ†\nTO DO !!!" }, { "objectID": "benchmarking.html#from-an-unique-parquet-file", "href": "benchmarking.html#from-an-unique-parquet-file", "title": "5Β  Benchmarking", "section": "5.3 From an unique parquet file", - "text": "5.3 From an unique parquet file\nFor this comparison on unique parquet file, we will use :\n\nFor polars (lazy), the pl$scan_parquet() method\nFor arrow (eager), the arrow::read_parquet() method\nFor arrow (lazy), the arrow::open_dataset() method\nFor Duckdb and SQL, the arrow::read_parquet() and DBI::dbGetQuery() methods\n\n\n\n\n\n\n\nNote\n\n\n\nWith arrow, you can use the following verbs from the tidyverse to do transformations on your tables.\n\n\n\npolars (lazy)arrow (eager)arrow (lazy)Duckdb and SQL\n\n\n\nparquet_polars_lazy <- function(variables) {\n \n result <- pl$scan_parquet(file = \"Datasets/DataMultiTypes.parquet\")$\n # Conversion of 2 columns to Date format\n with_columns(\n pl$col(\"colDate1\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE),\n pl$col(\"colDate2\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE)\n )$\n # Filter rows\n filter(\n pl$col(\"colInt\")>2000 & pl$col(\"colInt\")<8000\n )$\n # Grouping and aggregation\n groupby(\n \"colString\")$\n agg(\n pl$col(\"colInt\")$min()$alias(\"min_colInt\"),\n pl$col(\"colInt\")$mean()$alias(\"mean_colInt\"),\n pl$col(\"colInt\")$max()$alias(\"max_colInt\"),\n pl$col(\"colNum\")$min()$alias(\"min_colNum\"),\n pl$col(\"colNum\")$mean()$alias(\"mean_colNum\"),\n pl$col(\"colNum\")$max()$alias(\"max_colNum\")\n )\n \n return(result)\n}\ntic()\nparquet_polars_lazy()$collect()$to_data_frame()\n\n colString min_colInt mean_colInt max_colInt min_colNum mean_colNum\n1 B 2001 5004.311 7999 3.385660e-05 0.5005457\n2 A 2001 4998.625 7999 3.794138e-05 0.4984446\n3 C 2001 5001.243 7999 3.045052e-05 0.5014723\n max_colNum\n1 0.9999863\n2 0.9999879\n3 0.9999921\n\ntoc()\n\n0.046 sec elapsed\n\n\n\n\n\narrow_eager <- function(variables) {\n \n result <- arrow::read_parquet(\"Datasets/DataMultiTypes.parquet\") |>\n \n mutate(\n # Conversion of 2 columns to Date format\n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2)\n ) |>\n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n # Grouping and aggregation\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n )\n \n return(result)\n \n}\ntic()\narrow_eager()\n\n# A tibble: 3 Γ— 7\n colString min_colInt mean_colInt mas_colInt min_colNum mean_colNum max_colNum\n <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>\n1 A 2001 4999. 7999 0.0000379 0.498 1.00\n2 B 2001 5004. 7999 0.0000339 0.501 1.00\n3 C 2001 5001. 
7999 0.0000305 0.501 1.00\n\ntoc()\n\n0.142 sec elapsed\n\n\n\n\n\narrow_lazy <- function(variables) {\n \n result <- arrow::open_dataset(\"Datasets/DataMultiTypes.parquet\") |>\n \n mutate(\n # Conversion of 2 columns to Date format\n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2)\n ) |>\n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n # Grouping and aggregation\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n )\n \n return(result)\n \n}\ntic()\narrow_lazy() |> collect()\n\n# A tibble: 3 Γ— 7\n colString min_colInt mean_colInt mas_colInt min_colNum mean_colNum max_colNum\n <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>\n1 B 2001 5004. 7999 0.0000339 0.501 1.00\n2 C 2001 5001. 7999 0.0000305 0.501 1.00\n3 A 2001 4999. 7999 0.0000379 0.498 1.00\n\ntoc()\n\n0.208 sec elapsed\n\n\n\n\n\nparquet_duckdb_sql <- function(variables) {\n \n con <- dbConnect(duckdb::duckdb())\n \n result <- dbGetQuery(\n con, \n \"SELECT colString,\n MIN(colInt) AS min_colInt,\n AVG(colInt) AS mean_colInt,\n MAX(colInt) AS max_colInt,\n MIN(colNum) AS min_colNum,\n AVG(colNum) AS mean_colNum,\n MAX(colNum) AS max_colNum\n FROM (\n SELECT colString,\n colInt,\n colNum\n FROM read_parquet('Datasets/DataMultiTypes.parquet')\n WHERE colInt > 2000 AND colInt < 8000\n) AS filtered_data\nGROUP BY colString;\")\n \n dbDisconnect(con, shutdown=TRUE)\n \n return(result)\n}\ntic()\nparquet_duckdb_sql()\n\n colString min_colInt mean_colInt max_colInt min_colNum mean_colNum\n1 B 2001 5004.311 7999 3.385660e-05 0.5005457\n2 C 2001 5001.243 7999 3.045052e-05 0.5014723\n3 A 2001 4998.625 7999 3.794138e-05 0.4984446\n max_colNum\n1 0.9999863\n2 0.9999921\n3 0.9999879\n\ntoc()\n\n0.095 sec elapsed\n\n\n\n\n\n\n5.3.1 Results for unique parquet file\n\nunique_parquet_bmk <- microbenchmark(\n \"polars (lazy) - from unique parquet file\" = parquet_polars_lazy()$collect()$to_data_frame(),\n \"arrow (eager) - from unique parquet file\" = arrow_eager(),\n \"arrow (lazy) - from unique parquet file\" = arrow_lazy() |> collect(),\n \"Duckdb and SQL - from unique parquet file\" = parquet_duckdb_sql(),\n times = 5\n )\nprint(unique_parquet_bmk)\n\nUnit: milliseconds\n expr min lq mean median\n polars (lazy) - from unique parquet file 38.7847 39.6757 44.1609 41.6334\n arrow (eager) - from unique parquet file 104.4476 108.6151 116.7358 109.0899\n arrow (lazy) - from unique parquet file 143.5093 144.0727 150.9942 145.7879\n Duckdb and SQL - from unique parquet file 89.8875 90.6194 92.6642 93.4881\n uq max neval\n 42.0520 58.6587 5\n 128.3451 133.1814 5\n 154.7482 166.8528 5\n 93.9643 95.3617 5\n\n\nπŸ‘‰ Conclusion of this little benchmark based on unique parquet files: the big winner is polars (lazy mode) ! 
πŸ†πŸ†πŸ†" + "text": "5.3 From an unique parquet file\nFor this comparison on unique parquet file, we will use :\n\nFor polars (lazy), the pl$scan_parquet() method\nFor arrow (eager), the arrow::read_parquet() method\nFor arrow (lazy), the arrow::open_dataset() method\nFor Duckdb and SQL, the arrow::read_parquet() and DBI::dbGetQuery() methods\n\n\n\n\n\n\n\nNote\n\n\n\nWith arrow, you can use the following verbs from the tidyverse to do transformations on your tables.\n\n\n\npolars (lazy)arrow (eager)arrow (lazy)Duckdb and SQL\n\n\n\nparquet_polars_lazy <- function(variables) {\n \n result <- pl$scan_parquet(file = \"Datasets/DataMultiTypes.parquet\")$\n # Conversion of 2 columns to Date format\n with_columns(\n pl$col(\"colDate1\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE),\n pl$col(\"colDate2\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE)\n )$\n # Filter rows\n filter(\n pl$col(\"colInt\")>2000 & pl$col(\"colInt\")<8000\n )$\n # Grouping and aggregation\n groupby(\n \"colString\")$\n agg(\n pl$col(\"colInt\")$min()$alias(\"min_colInt\"),\n pl$col(\"colInt\")$mean()$alias(\"mean_colInt\"),\n pl$col(\"colInt\")$max()$alias(\"max_colInt\"),\n pl$col(\"colNum\")$min()$alias(\"min_colNum\"),\n pl$col(\"colNum\")$mean()$alias(\"mean_colNum\"),\n pl$col(\"colNum\")$max()$alias(\"max_colNum\")\n )\n \n return(result)\n}\ntic()\nparquet_polars_lazy()$collect()$to_data_frame()\n\n colString min_colInt mean_colInt max_colInt min_colNum mean_colNum\n1 A 2001 4998.625 7999 3.794138e-05 0.4984446\n2 C 2001 5001.243 7999 3.045052e-05 0.5014723\n3 B 2001 5004.311 7999 3.385660e-05 0.5005457\n max_colNum\n1 0.9999879\n2 0.9999921\n3 0.9999863\n\ntoc()\n\n0.043 sec elapsed\n\n\n\n\n\narrow_eager <- function(variables) {\n \n result <- arrow::read_parquet(\"Datasets/DataMultiTypes.parquet\") |>\n \n mutate(\n # Conversion of 2 columns to Date format\n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2)\n ) |>\n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n # Grouping and aggregation\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n )\n \n return(result)\n \n}\ntic()\narrow_eager()\n\n# A tibble: 3 Γ— 7\n colString min_colInt mean_colInt mas_colInt min_colNum mean_colNum max_colNum\n <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>\n1 A 2001 4999. 7999 0.0000379 0.498 1.00\n2 B 2001 5004. 7999 0.0000339 0.501 1.00\n3 C 2001 5001. 7999 0.0000305 0.501 1.00\n\ntoc()\n\n0.131 sec elapsed\n\n\n\n\n\narrow_lazy <- function(variables) {\n \n result <- arrow::open_dataset(\"Datasets/DataMultiTypes.parquet\") |>\n \n mutate(\n # Conversion of 2 columns to Date format\n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2)\n ) |>\n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n # Grouping and aggregation\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n )\n \n return(result)\n \n}\ntic()\narrow_lazy() |> collect()\n\n# A tibble: 3 Γ— 7\n colString min_colInt mean_colInt mas_colInt min_colNum mean_colNum max_colNum\n <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>\n1 B 2001 5004. 7999 0.0000339 0.501 1.00\n2 C 2001 5001. 7999 0.0000305 0.501 1.00\n3 A 2001 4999. 
7999 0.0000379 0.498 1.00\n\ntoc()\n\n0.156 sec elapsed\n\n\n\n\n\nparquet_duckdb_sql <- function(variables) {\n \n con <- dbConnect(duckdb::duckdb())\n \n result <- dbGetQuery(\n con, \n \"SELECT colString,\n MIN(colInt) AS min_colInt,\n AVG(colInt) AS mean_colInt,\n MAX(colInt) AS max_colInt,\n MIN(colNum) AS min_colNum,\n AVG(colNum) AS mean_colNum,\n MAX(colNum) AS max_colNum\n FROM (\n SELECT colString,\n colInt,\n colNum\n FROM read_parquet('Datasets/DataMultiTypes.parquet')\n WHERE colInt > 2000 AND colInt < 8000\n) AS filtered_data\nGROUP BY colString;\")\n \n dbDisconnect(con, shutdown=TRUE)\n \n return(result)\n}\ntic()\nparquet_duckdb_sql()\n\n colString min_colInt mean_colInt max_colInt min_colNum mean_colNum\n1 B 2001 5004.311 7999 3.385660e-05 0.5005457\n2 C 2001 5001.243 7999 3.045052e-05 0.5014723\n3 A 2001 4998.625 7999 3.794138e-05 0.4984446\n max_colNum\n1 0.9999863\n2 0.9999921\n3 0.9999879\n\ntoc()\n\n0.09 sec elapsed\n\n\n\n\n\n\n5.3.1 Results for unique parquet file\n\nunique_parquet_bmk <- microbenchmark(\n \"polars (lazy) - from unique parquet file\" = parquet_polars_lazy()$collect()$to_data_frame(),\n \"arrow (eager) - from unique parquet file\" = arrow_eager(),\n \"arrow (lazy) - from unique parquet file\" = arrow_lazy() |> collect(),\n \"Duckdb and SQL - from unique parquet file\" = parquet_duckdb_sql(),\n times = 5\n )\nprint(unique_parquet_bmk)\n\nUnit: milliseconds\n expr min lq mean median\n polars (lazy) - from unique parquet file 38.8066 39.0678 43.52496 39.6750\n arrow (eager) - from unique parquet file 96.0868 97.3807 111.94214 109.0585\n arrow (lazy) - from unique parquet file 116.7953 118.9454 128.50826 128.2780\n Duckdb and SQL - from unique parquet file 85.2885 87.5746 88.20928 88.3278\n uq max neval\n 40.0819 59.9935 5\n 118.3866 138.7981 5\n 134.7467 143.7759 5\n 88.9761 90.8794 5\n\n\nπŸ‘‰ Conclusion of this little benchmark based on unique parquet files: the big winner is polars (lazy mode) ! 
πŸ†πŸ†πŸ†" }, { "objectID": "benchmarking.html#from-a-partitioned-parquet-file", "href": "benchmarking.html#from-a-partitioned-parquet-file", "title": "5Β  Benchmarking", "section": "5.4 From a partitioned parquet file", - "text": "5.4 From a partitioned parquet file\nLet’s now look at how to perform queries on partitioned files.\nThe structure of partitioned files on the disk is as follows:\n\nfs::dir_tree(path = \"Datasets/DataMultiTypes/\")\n\nDatasets/DataMultiTypes/\nβ”œβ”€β”€ colFactor=High\nβ”‚ └── part-0.parquet\nβ”œβ”€β”€ colFactor=Low\nβ”‚ └── part-0.parquet\n└── colFactor=Medium\n └── part-0.parquet\n\n\nFor this comparison, we will use :\n\nFor arrow (lazy), the arrow::open_dataset() method\nFor dplyr (duckdb), the DBI::dbConnect, dplyr::tbl() and arrow::read_parquet() methods\nFor polars (lazy), the pl$scan_parquet() method\n\n\narrow (lazy)dplyr (duckdb)polars (lazy)\n\n\n\npartitioned_parquet_arrow_lazy <- function(variables) {\n \n result <- arrow::open_dataset(\n \"Datasets/DataMultiTypes/\",\n partitioning = arrow::schema(colFactor = arrow::utf8())) |>\n \n mutate(\n # Conversion of 2 columns to Date format\n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2)\n ) |>\n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n # Grouping and aggregation\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n ) |> \n collect()\n \n return(result)\n \n}\ntic()\npartitioned_parquet_arrow_lazy() \n\n# A tibble: 3 Γ— 7\n colString min_colInt mean_colInt mas_colInt min_colNum mean_colNum max_colNum\n <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>\n1 B 2001 5004. 7999 0.0000339 0.501 1.00\n2 C 2001 5001. 7999 0.0000305 0.501 1.00\n3 A 2001 4999. 7999 0.0000379 0.498 1.00\n\ntoc()\n\n0.201 sec elapsed\n\n\n\n\n\n# library(dbplyr)\n\npartitioned_parquet_dplyr_duckdb <- function(variables) {\n \n con <- DBI::dbConnect(duckdb::duckdb())\n \n result <- tbl(con, \"read_parquet('Datasets/DataMultiTypes/*/*.parquet', hive_partitioning=1)\") |>\n \n mutate(\n # Conversion of 2 columns to Date format\n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2)\n ) |>\n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n # Grouping and aggregation\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt, na.rm = TRUE),\n mean_colInt = mean(colInt, na.rm = TRUE),\n mas_colInt = max(colInt, na.rm = TRUE),\n min_colNum = min(colNum, na.rm = TRUE),\n mean_colNum = mean(colNum, na.rm = TRUE),\n max_colNum = max(colNum, na.rm = TRUE)\n ) |> \n collect()\n \n DBI::dbDisconnect(con)\n return(result)\n}\ntic()\npartitioned_parquet_dplyr_duckdb() \n\n# A tibble: 3 Γ— 7\n colString min_colInt mean_colInt mas_colInt min_colNum mean_colNum max_colNum\n <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>\n1 B 2001 5004. 7999 0.0000339 0.501 1.00\n2 C 2001 5001. 7999 0.0000305 0.501 1.00\n3 A 2001 4999. 
7999 0.0000379 0.498 1.00\n\ntoc()\n\n0.526 sec elapsed\n\n\n\n\n\npartitioned_parquet_polars_lazy <- function(variables) {\n \n result <- pl$scan_parquet(file = \"Datasets/DataMultiTypes.parquet\")$\n # Conversion of 2 columns to Date format\n with_columns(\n pl$col(\"colDate1\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE),\n pl$col(\"colDate2\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE)\n )$\n # Filter rows\n filter(\n pl$col(\"colInt\")>2000 & pl$col(\"colInt\")<8000\n )$\n # Grouping and aggregation\n groupby(\n \"colString\")$\n agg(\n pl$col(\"colInt\")$min()$alias(\"min_colInt\"),\n pl$col(\"colInt\")$mean()$alias(\"mean_colInt\"),\n pl$col(\"colInt\")$max()$alias(\"max_colInt\"),\n pl$col(\"colNum\")$min()$alias(\"min_colNum\"),\n pl$col(\"colNum\")$mean()$alias(\"mean_colNum\"),\n pl$col(\"colNum\")$max()$alias(\"max_colNum\")\n )$collect()\n \n return(result)\n}\ntic()\npartitioned_parquet_polars_lazy()\n\nshape: (3, 7)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ colString ┆ min_colInt ┆ mean_colInt ┆ max_colInt ┆ min_colNum ┆ mean_colNum ┆ max_colNum β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ i32 ┆ f64 ┆ i32 ┆ f64 ┆ f64 ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════β•ͺ═════════════β•ͺ════════════β•ͺ════════════β•ͺ═════════════β•ͺ════════════║\nβ”‚ C ┆ 2001 ┆ 5001.243285 ┆ 7999 ┆ 0.00003 ┆ 0.501472 ┆ 0.999992 β”‚\nβ”‚ A ┆ 2001 ┆ 4998.624945 ┆ 7999 ┆ 0.000038 ┆ 0.498445 ┆ 0.999988 β”‚\nβ”‚ B ┆ 2001 ┆ 5004.31148 ┆ 7999 ┆ 0.000034 ┆ 0.500546 ┆ 0.999986 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.042 sec elapsed\n\n\n\n\n\n\n5.4.1 Results for partitioned parquet files\n\npartitioned_parquet_bmk <- microbenchmark(\n \"arrow (lazy) - from partitioned parquet file\" = partitioned_parquet_arrow_lazy(),\n \"dplyr (duckdb) - from partitioned parquet file\" = partitioned_parquet_dplyr_duckdb(),\n \"polars (lazy) - from partitioned parquet file\" = partitioned_parquet_polars_lazy()$to_data_frame(),\n times = 5\n )\nprint(partitioned_parquet_bmk)\n\nUnit: milliseconds\n expr min lq mean\n arrow (lazy) - from partitioned parquet file 149.9898 161.4222 166.98852\n dplyr (duckdb) - from partitioned parquet file 463.1010 473.2102 481.29544\n polars (lazy) - from partitioned parquet file 38.9680 39.6177 44.81158\n median uq max neval\n 163.7782 179.0058 180.7466 5\n 474.0380 491.8206 504.3074 5\n 39.6533 39.9143 65.9046 5\n\n\nπŸ‘‰ Conclusion of this little benchmark based on partitioned parquet files: as for unique parquet files, the big winner is polars (lazy mode) ! 
πŸ†πŸ†πŸ†" + "text": "5.4 From a partitioned parquet file\nLet’s now look at how to perform queries on partitioned files.\nThe structure of partitioned files on the disk is as follows:\n\nfs::dir_tree(path = \"Datasets/DataMultiTypes/\")\n\nDatasets/DataMultiTypes/\nβ”œβ”€β”€ colFactor=High\nβ”‚ └── part-0.parquet\nβ”œβ”€β”€ colFactor=Low\nβ”‚ └── part-0.parquet\n└── colFactor=Medium\n └── part-0.parquet\n\n\nFor this comparison, we will use :\n\nFor arrow (lazy), the arrow::open_dataset() method\nFor dplyr (duckdb), the DBI::dbConnect, dplyr::tbl() and arrow::read_parquet() methods\nFor polars (lazy), the pl$scan_parquet() method\n\n\narrow (lazy)dplyr (duckdb)polars (lazy)\n\n\n\npartitioned_parquet_arrow_lazy <- function(variables) {\n \n result <- arrow::open_dataset(\n \"Datasets/DataMultiTypes/\",\n partitioning = arrow::schema(colFactor = arrow::utf8())) |>\n \n mutate(\n # Conversion of 2 columns to Date format\n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2)\n ) |>\n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n # Grouping and aggregation\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt),\n mean_colInt = mean(colInt),\n mas_colInt = max(colInt),\n min_colNum = min(colNum),\n mean_colNum = mean(colNum),\n max_colNum = max(colNum)\n ) |> \n collect()\n \n return(result)\n \n}\ntic()\npartitioned_parquet_arrow_lazy() \n\n# A tibble: 3 Γ— 7\n colString min_colInt mean_colInt mas_colInt min_colNum mean_colNum max_colNum\n <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>\n1 C 2001 5001. 7999 0.0000305 0.501 1.00\n2 B 2001 5004. 7999 0.0000339 0.501 1.00\n3 A 2001 4999. 7999 0.0000379 0.498 1.00\n\ntoc()\n\n0.157 sec elapsed\n\n\n\n\n\n# library(dbplyr)\n\npartitioned_parquet_dplyr_duckdb <- function(variables) {\n \n con <- DBI::dbConnect(duckdb::duckdb())\n \n result <- tbl(con, \"read_parquet('Datasets/DataMultiTypes/*/*.parquet', hive_partitioning=1)\") |>\n \n mutate(\n # Conversion of 2 columns to Date format\n colDate1 = as.Date(colDate1),\n colDate2 = as.Date(colDate2)\n ) |>\n # Filter rows\n filter(\n colInt>2000 & colInt<8000\n ) |>\n # Grouping and aggregation\n group_by(colString) |> \n summarise(\n min_colInt = min(colInt, na.rm = TRUE),\n mean_colInt = mean(colInt, na.rm = TRUE),\n mas_colInt = max(colInt, na.rm = TRUE),\n min_colNum = min(colNum, na.rm = TRUE),\n mean_colNum = mean(colNum, na.rm = TRUE),\n max_colNum = max(colNum, na.rm = TRUE)\n ) |> \n collect()\n \n DBI::dbDisconnect(con)\n return(result)\n}\ntic()\npartitioned_parquet_dplyr_duckdb() \n\n# A tibble: 3 Γ— 7\n colString min_colInt mean_colInt mas_colInt min_colNum mean_colNum max_colNum\n <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>\n1 B 2001 5004. 7999 0.0000339 0.501 1.00\n2 C 2001 5001. 7999 0.0000305 0.501 1.00\n3 A 2001 4999. 
7999 0.0000379 0.498 1.00\n\ntoc()\n\n0.403 sec elapsed\n\n\n\n\n\npartitioned_parquet_polars_lazy <- function(variables) {\n \n result <- pl$scan_parquet(file = \"Datasets/DataMultiTypes.parquet\")$\n # Conversion of 2 columns to Date format\n with_columns(\n pl$col(\"colDate1\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE),\n pl$col(\"colDate2\")$str$strptime(pl$Date, \"%F %T\", strict = FALSE)\n )$\n # Filter rows\n filter(\n pl$col(\"colInt\")>2000 & pl$col(\"colInt\")<8000\n )$\n # Grouping and aggregation\n groupby(\n \"colString\")$\n agg(\n pl$col(\"colInt\")$min()$alias(\"min_colInt\"),\n pl$col(\"colInt\")$mean()$alias(\"mean_colInt\"),\n pl$col(\"colInt\")$max()$alias(\"max_colInt\"),\n pl$col(\"colNum\")$min()$alias(\"min_colNum\"),\n pl$col(\"colNum\")$mean()$alias(\"mean_colNum\"),\n pl$col(\"colNum\")$max()$alias(\"max_colNum\")\n )$collect()\n \n return(result)\n}\ntic()\npartitioned_parquet_polars_lazy()\n\nshape: (3, 7)\nβ”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\nβ”‚ colString ┆ min_colInt ┆ mean_colInt ┆ max_colInt ┆ min_colNum ┆ mean_colNum ┆ max_colNum β”‚\nβ”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\nβ”‚ str ┆ i32 ┆ f64 ┆ i32 ┆ f64 ┆ f64 ┆ f64 β”‚\nβ•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════β•ͺ═════════════β•ͺ════════════β•ͺ════════════β•ͺ═════════════β•ͺ════════════║\nβ”‚ B ┆ 2001 ┆ 5004.31148 ┆ 7999 ┆ 0.000034 ┆ 0.500546 ┆ 0.999986 β”‚\nβ”‚ C ┆ 2001 ┆ 5001.243285 ┆ 7999 ┆ 0.00003 ┆ 0.501472 ┆ 0.999992 β”‚\nβ”‚ A ┆ 2001 ┆ 4998.624945 ┆ 7999 ┆ 0.000038 ┆ 0.498445 ┆ 0.999988 β”‚\nβ””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n\ntoc()\n\n0.041 sec elapsed\n\n\n\n\n\n\n5.4.1 Results for partitioned parquet files\n\npartitioned_parquet_bmk <- microbenchmark(\n \"arrow (lazy) - from partitioned parquet file\" = partitioned_parquet_arrow_lazy(),\n \"dplyr (duckdb) - from partitioned parquet file\" = partitioned_parquet_dplyr_duckdb(),\n \"polars (lazy) - from partitioned parquet file\" = partitioned_parquet_polars_lazy()$to_data_frame(),\n times = 5\n )\nprint(partitioned_parquet_bmk)\n\nUnit: milliseconds\n expr min lq mean\n arrow (lazy) - from partitioned parquet file 124.9216 125.9604 131.99446\n dplyr (duckdb) - from partitioned parquet file 346.2294 359.3282 364.48044\n polars (lazy) - from partitioned parquet file 40.0859 42.1322 44.76884\n median uq max neval\n 130.9570 135.9574 142.1759 5\n 360.2117 362.5724 394.0605 5\n 42.9752 43.5088 55.1421 5\n\n\nπŸ‘‰ Conclusion of this little benchmark based on partitioned parquet files: as for unique parquet files, the big winner is polars (lazy mode) ! πŸ†πŸ†πŸ†" }, { "objectID": "benchmarking.html#from-a-duckdb-file", "href": "benchmarking.html#from-a-duckdb-file", "title": "5Β  Benchmarking", "section": "5.5 From a DuckDb file", - "text": "5.5 From a DuckDb file\nLet’s look at how to perform queries on duckdb files.\nFor this comparison, we will use :\n\nFor SQL, the DBI::dbGetQuery() method. 
In this way, we use the standard DBI methods to work from a DuckDb file.\n\n\nSQL\n\n\n\nduckdb_dbfile_sql <- function(variables) {\n \n con <- dbConnect(duckdb::duckdb(),\n \"Datasets/DataMultiTypes.duckdb\")\n \n result <- dbGetQuery(\n con, \n \"SELECT colString,\n MIN(colInt) AS min_colInt,\n AVG(colInt) AS mean_colInt,\n MAX(colInt) AS max_colInt,\n MIN(colNum) AS min_colNum,\n AVG(colNum) AS mean_colNum,\n MAX(colNum) AS max_colNum\n FROM (\n SELECT colString,\n colInt,\n colNum\n FROM DataMultiTypes\n WHERE colInt > 2000 AND colInt < 8000\n) AS filtered_data\nGROUP BY colString;\")\n \n dbDisconnect(con, shutdown=TRUE)\n \n return(result)\n \n}\ntic()\nduckdb_dbfile_sql()\n\n colString min_colInt mean_colInt max_colInt min_colNum mean_colNum\n1 B 2001 5004.311 7999 3.385660e-05 0.5005457\n2 A 2001 4998.625 7999 3.794138e-05 0.4984446\n3 C 2001 5001.243 7999 3.045052e-05 0.5014723\n max_colNum\n1 0.9999863\n2 0.9999879\n3 0.9999921\n\ntoc()\n\n0.082 sec elapsed\n\n\n\n\n\n\n5.5.1 Results for DuckDB file\n\nduckdb_bmk <- microbenchmark(\n \"SQL from duckdb file\" = duckdb_dbfile_sql(),\n times = 5\n )\nduckdb_bmk\n\nUnit: milliseconds\n expr min lq mean median uq max neval\n SQL from duckdb file 79.5339 81.5291 84.39356 84.5486 85.172 91.1842 5\n\n\nNote that the query with the standard DBI methods is faster than those with dplyr verbs πŸ†" + "text": "5.5 From a DuckDb file\nLet’s look at how to perform queries on duckdb files.\nFor this comparison, we will use :\n\nFor SQL, the DBI::dbGetQuery() method. In this way, we use the standard DBI methods to work from a DuckDb file.\n\n\nSQL\n\n\n\nduckdb_dbfile_sql <- function(variables) {\n \n con <- dbConnect(duckdb::duckdb(),\n \"Datasets/DataMultiTypes.duckdb\")\n \n result <- dbGetQuery(\n con, \n \"SELECT colString,\n MIN(colInt) AS min_colInt,\n AVG(colInt) AS mean_colInt,\n MAX(colInt) AS max_colInt,\n MIN(colNum) AS min_colNum,\n AVG(colNum) AS mean_colNum,\n MAX(colNum) AS max_colNum\n FROM (\n SELECT colString,\n colInt,\n colNum\n FROM DataMultiTypes\n WHERE colInt > 2000 AND colInt < 8000\n) AS filtered_data\nGROUP BY colString;\")\n \n dbDisconnect(con, shutdown=TRUE)\n \n return(result)\n \n}\ntic()\nduckdb_dbfile_sql()\n\n colString min_colInt mean_colInt max_colInt min_colNum mean_colNum\n1 B 2001 5004.311 7999 3.385660e-05 0.5005457\n2 C 2001 5001.243 7999 3.045052e-05 0.5014723\n3 A 2001 4998.625 7999 3.794138e-05 0.4984446\n max_colNum\n1 0.9999863\n2 0.9999921\n3 0.9999879\n\ntoc()\n\n0.082 sec elapsed\n\n\n\n\n\n\n5.5.1 Results for DuckDB file\n\nduckdb_bmk <- microbenchmark(\n \"SQL from duckdb file\" = duckdb_dbfile_sql(),\n times = 5\n )\nduckdb_bmk\n\nUnit: milliseconds\n expr min lq mean median uq max neval\n SQL from duckdb file 73.6214 75.591 78.45332 77.7204 81.3255 84.0083 5\n\n\nNote that the query with the standard DBI methods is faster than those with dplyr verbs πŸ†" }, { "objectID": "benchmarking.html#final-results",