From 1b5630db010e1089d2fa43cf288dd97935520033 Mon Sep 17 00:00:00 2001
From: ritchie <ritchie46@gmail.com>
Date: Mon, 25 Nov 2024 13:44:52 +0100
Subject: [PATCH 01/10] fix: Fix Polars queries

---
 polars/query.py | 128 +++++++++++++++++++++++-------------------------
 1 file changed, 61 insertions(+), 67 deletions(-)

diff --git a/polars/query.py b/polars/query.py
index 96df85dd5..269463acf 100755
--- a/polars/query.py
+++ b/polars/query.py
@@ -3,7 +3,7 @@
 import pandas as pd
 import polars as pl
 import timeit
-import datetime
+from datetime import datetime, date
 import json
 
 hits = pd.read_parquet("hits.parquet")
@@ -22,48 +22,48 @@
         hits[col] = hits[col].astype(str)
 
 start = timeit.default_timer()
-pl_df = pl.DataFrame(hits)
+pl_df = pl.DataFrame(hits).rechunk()
 stop = timeit.default_timer()
 load_time = stop - start
 
 # 0: No., 1: SQL, 2: Pandas, 3: Polars
-queries = queries = [
-    ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.count(), lambda x: x.height),
+queries = [
+    ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.count(), lambda x: x.select(pl.len()).collect().height),
     (
         "Q1",
         "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;",
         lambda x: x[x["AdvEngineID"] != 0].count(),
-        lambda x: x.filter(pl.col("AdvEngineID") != 0).height,
+        lambda x: x.select(pl.col("AdvEngineID").filter(pl.col("AdvEngineID") != 0).count()).collect().height,
     ),
     (
         "Q2",
         "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;",
         lambda x: (x["AdvEngineID"].sum(), x.shape[0], x["ResolutionWidth"].mean()),
-        lambda x: (x["AdvEngineID"].sum(), x.height, x["ResolutionWidth"].mean()),
+        lambda x: (x.select(pl.col("advengineid").sum()).collect().item(), x.select(pl.len()).collect().item(), x.select(pl.col("advengineid").mean()).collect().item()),
     ),
     (
         "Q3",
         "SELECT AVG(UserID) FROM hits;",
         lambda x: x["UserID"].mean(),
-        lambda x: x["UserID"].mean(),
+        lambda x: x.select(pl.col("UserID").mean()).collect().item(),
     ),
     (
         "Q4",
         "SELECT COUNT(DISTINCT UserID) FROM hits;",
         lambda x: x["UserID"].nunique(),
-        lambda x: x["UserID"].n_unique(),
+        lambda x: x.select(pl.col("UserID").n_unique()).collect().item(),
     ),
     (
         "Q5",
         "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;",
         lambda x: x["SearchPhrase"].nunique(),
-        lambda x: x["SearchPhrase"].n_unique(),
+        lambda x: x.select(pl.col("SearchPhrase").n_unique()).collect().item(),
     ),
     (
         "Q6",
         "SELECT MIN(EventDate), MAX(EventDate) FROM hits;",
         lambda x: (x["EventDate"].min(), x["EventDate"].max()),
-        lambda x: (x["EventDate"].min(), x["EventDate"].max()),
+        lambda x: x.select(pl.col("EventDate").min().alias("e_min"), pl.col("EventDate").max("e_max")).collect().rows()[0]
     ),
     (
         "Q7",
@@ -75,7 +75,7 @@
         lambda x: x.filter(pl.col("AdvEngineID") != 0)
         .group_by("AdvEngineID")
         .agg(pl.len().alias("count"))
-        .sort("count", descending=True),
+        .sort("count", descending=True).collect(),
     ),
     (
         "Q8",
@@ -84,7 +84,7 @@
         lambda x: x.group_by("RegionID")
         .agg(pl.col("UserID").n_unique())
         .sort("UserID", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q9",
@@ -101,7 +101,7 @@
             ]
         )
         .sort("AdvEngineID_sum", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q10",
@@ -114,7 +114,7 @@
         .group_by("MobilePhoneModel")
         .agg(pl.col("UserID").n_unique())
         .sort("UserID", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q11",
@@ -127,7 +127,7 @@
         .group_by(["MobilePhone", "MobilePhoneModel"])
         .agg(pl.col("UserID").n_unique())
         .sort("UserID", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q12",
@@ -140,7 +140,7 @@
         .group_by("SearchPhrase")
         .agg(pl.len().alias("count"))
         .sort("count", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q13",
@@ -153,7 +153,7 @@
         .group_by("SearchPhrase")
         .agg(pl.col("UserID").n_unique())
         .sort("UserID", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q14",
@@ -166,7 +166,7 @@
         .group_by(["SearchEngineID", "SearchPhrase"])
         .agg(pl.len().alias("count"))
         .sort("count", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q15",
@@ -175,7 +175,7 @@
         lambda x: x.group_by("UserID")
         .agg(pl.len().alias("count"))
         .sort("count", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q16",
@@ -184,13 +184,13 @@
         lambda x: x.group_by(["UserID", "SearchPhrase"])
         .agg(pl.len().alias("count"))
         .sort("count", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q17",
         "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;",
         lambda x: x.groupby(["UserID", "SearchPhrase"]).size().head(10),
-        lambda x: x.group_by(["UserID", "SearchPhrase"]).agg(pl.len()).head(10),
+        lambda x: x.group_by(["UserID", "SearchPhrase"]).agg(pl.len()).head(10).collect(),
     ),
     (
         "Q18",
@@ -203,19 +203,19 @@
         )
         .agg(pl.len().alias("count"))
         .sort("count", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q19",
         "SELECT UserID FROM hits WHERE UserID = 435090932899640449;",
         lambda x: x[x["UserID"] == 435090932899640449],
-        lambda x: x.filter(pl.col("UserID") == 435090932899640449),
+        lambda x: x.select("UserID").filter(pl.col("UserID") == 435090932899640449).collect(),
     ),
     (
         "Q20",
         "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';",
         lambda x: x[x["URL"].str.contains("google")].shape[0],
-        lambda x: x.filter(pl.col("URL").str.contains("google")).height,
+        lambda x: x.filter(pl.col("URL").str.contains("google")).select(pl.len()).collect().item(),
     ),
     (
         "Q21",
@@ -230,7 +230,7 @@
         .group_by("SearchPhrase")
         .agg([pl.col("URL").min(), pl.len().alias("count")])
         .sort("count", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q22",
@@ -260,7 +260,7 @@
             ]
         )
         .sort("count", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q23",
@@ -270,7 +270,7 @@
         .head(10),
         lambda x: x.filter(pl.col("URL").str.contains("google"))
         .sort("EventTime")
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q24",
@@ -281,7 +281,7 @@
         lambda x: x.filter(pl.col("SearchPhrase") != "")
         .sort("EventTime")
         .select("SearchPhrase")
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q25",
@@ -292,7 +292,7 @@
         lambda x: x.filter(pl.col("SearchPhrase") != "")
         .sort("SearchPhrase")
         .select("SearchPhrase")
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q26",
@@ -303,7 +303,7 @@
         lambda x: x.filter(pl.col("SearchPhrase") != "")
         .sort(["EventTime", "SearchPhrase"])
         .select("SearchPhrase")
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q27",
@@ -318,15 +318,13 @@
         .group_by("CounterID")  # GROUP BY CounterID
         .agg(
             [
-                pl.col("URL")
-                .map_elements(lambda y: len(y), return_dtype=pl.Int64)
-                .alias("l"),  # AVG(STRLEN(URL))
+                pl.col("URL").str.len_chars().mean().alias("l"),  # AVG(STRLEN(URL))
                 pl.len().alias("c"),  # COUNT(*)
             ]
         )
         .filter(pl.col("c") > 100000)  # HAVING COUNT(*) > 100000
         .sort("l", descending=True)  # ORDER BY l DESC
-        .limit(25),  # LIMIT 25,
+        .limit(25).collect(),  # LIMIT 25,
     ),
     (
         "Q28",
@@ -352,18 +350,14 @@
             .group_by("k")
             .agg(
                 [
-                    pl.col("Referer").map_elements(
-                        lambda y: len(y), return_dtype=pl.Int64
-                    )
-                    # .mean() # skip mean for now
-                    .alias("l"),  # AVG(STRLEN(Referer))
+                    pl.col("Referer").str.len_chars().mean().alias("l"),  # AVG(STRLEN(Referer))
                     pl.col("Referer").min().alias("min_referer"),  # MIN(Referer)
                     pl.len().alias("c"),  # COUNT(*)
                 ]
             )
             .filter(pl.col("c") > 100000)  # HAVING COUNT(*) > 100000
             .sort("l", descending=True)  # ORDER BY l DESC
-            .limit(25)  # LIMIT 25
+            .limit(25).collect()  # LIMIT 25
         ),
     ),
     (
@@ -459,7 +453,7 @@
         + x["ResolutionWidth"].shift(87).sum()
         + x["ResolutionWidth"].shift(88).sum()
         + x["ResolutionWidth"].shift(89).sum(),
-        lambda x: sum(x["ResolutionWidth"][:90] + pl.Series(range(90))),
+        lambda x: x.select(pl.sum_horizontal([pl.col("ResolutionWidth").shift(i) for i in range(1, 90)])).collect(),
     ),
     (
         "Q30",
@@ -482,7 +476,7 @@
             ]
         )
         .sort("c", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q31",
@@ -505,7 +499,7 @@
             ]
         )
         .sort("c", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q32",
@@ -526,7 +520,7 @@
             ]
         )
         .sort("c", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q33",
@@ -535,7 +529,7 @@
         lambda x: x.group_by("URL")
         .agg(pl.len().alias("c"))
         .sort("c", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q34",
@@ -544,7 +538,7 @@
         lambda x: x.group_by("URL")
         .agg(pl.len().alias("c"))
         .sort("c", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q35",
@@ -562,7 +556,7 @@
         .group_by(["ClientIP"])
         .agg(pl.len().alias("c"))
         .sort("c", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q36",
@@ -580,8 +574,8 @@
         .nlargest(10),
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
-            & (pl.col("EventDate") >= pl.datetime(2013, 7, 1))
-            & (pl.col("EventDate") <= pl.datetime(2013, 7, 31))
+            & (pl.col("EventDate") >= datetime(2013, 7, 1))
+            & (pl.col("EventDate") <= datetime(2013, 7, 31))
             & (pl.col("DontCountHits") == 0)
             & (pl.col("IsRefresh") == 0)
             & (pl.col("URL") != "")
@@ -589,7 +583,7 @@
         .group_by("URL")
         .agg(pl.len().alias("PageViews"))
         .sort("PageViews", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q37",
@@ -607,8 +601,8 @@
         .nlargest(10),
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
-            & (pl.col("EventDate") >= pl.datetime(2013, 7, 1))
-            & (pl.col("EventDate") <= pl.datetime(2013, 7, 31))
+            & (pl.col("EventDate") >= datetime(2013, 7, 1))
+            & (pl.col("EventDate") <= datetime(2013, 7, 31))
             & (pl.col("DontCountHits") == 0)
             & (pl.col("IsRefresh") == 0)
             & (pl.col("Title") != "")
@@ -616,7 +610,7 @@
         .group_by("Title")
         .agg(pl.len().alias("PageViews"))
         .sort("PageViews", descending=True)
-        .head(10),
+        .head(10).collect(),
     ),
     (
         "Q38",
@@ -636,8 +630,8 @@
         .iloc[1000:1010],
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
-            & (pl.col("EventDate") >= pl.datetime(2013, 7, 1))
-            & (pl.col("EventDate") <= pl.datetime(2013, 7, 31))
+            & (pl.col("EventDate") >= datetime(2013, 7, 1))
+            & (pl.col("EventDate") <= datetime(2013, 7, 31))
             & (pl.col("IsRefresh") == 0)
             & (pl.col("IsLink") != 0)
             & (pl.col("IsDownload") == 0)
@@ -645,7 +639,7 @@
         .group_by("URL")
         .agg(pl.len().alias("PageViews"))
         .sort("PageViews", descending=True)
-        .slice(1000, 10),
+        .slice(1000, 10).collect(),
     ),
     (
         "Q39",
@@ -668,8 +662,8 @@
         #   note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
         # lambda x: x.filter(
         #     (pl.col("CounterID") == 62)
-        #     & (pl.col("EventDate") >= pl.datetime(2013, 7, 1))
-        #     & (pl.col("EventDate") <= pl.datetime(2013, 7, 31))
+        #     & (pl.col("EventDate") >= datetime(2013, 7, 1))
+        #     & (pl.col("EventDate") <= datetime(2013, 7, 31))
         #     & (pl.col("IsRefresh") == 0)
         # )
         # .group_by(
@@ -706,8 +700,8 @@
         .iloc[100:110],
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
-            & (pl.col("EventDate") >= pl.datetime(2013, 7, 1))
-            & (pl.col("EventDate") <= pl.datetime(2013, 7, 31))
+            & (pl.col("EventDate") >= datetime(2013, 7, 1))
+            & (pl.col("EventDate") <= datetime(2013, 7, 31))
             & (pl.col("IsRefresh") == 0)
             & (pl.col("TraficSourceID").is_in([-1, 6]))
             & (pl.col("RefererHash") == 3594120000172545465)
@@ -715,7 +709,7 @@
         .group_by(["URLHash", "EventDate"])
         .agg(pl.len().alias("PageViews"))
         .sort("PageViews", descending=True)
-        .slice(100, 10),
+        .slice(100, 10).collect(),
     ),
     (
         "Q41",
@@ -735,8 +729,8 @@
         .iloc[10000:10010],
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
-            & (pl.col("EventDate") >= pl.datetime(2013, 7, 1))
-            & (pl.col("EventDate") <= pl.datetime(2013, 7, 31))
+            & (pl.col("EventDate") >= datetime(2013, 7, 1))
+            & (pl.col("EventDate") <= datetime(2013, 7, 31))
             & (pl.col("IsRefresh") == 0)
             & (pl.col("DontCountHits") == 0)
             & (pl.col("URLHash") == 2868770270353813622)
@@ -744,7 +738,7 @@
         .group_by(["WindowClientWidth", "WindowClientHeight"])
         .agg(pl.len().alias("PageViews"))
         .sort("PageViews", descending=True)
-        .slice(10000, 10),
+        .slice(10000, 10).collect(),
     ),
     (
         "Q42",
@@ -766,8 +760,8 @@
         #   expected leading integer in the duration string, found m
         # lambda x: x.filter(
         #     (pl.col("CounterID") == 62)
-        #     & (pl.col("EventDate") >= pl.datetime(2013, 7, 14))
-        #     & (pl.col("EventDate") <= pl.datetime(2013, 7, 15))
+        #     & (pl.col("EventDate") >= datetime(2013, 7, 14))
+        #     & (pl.col("EventDate") <= datetime(2013, 7, 15))
         #     & (pl.col("IsRefresh") == 0)
         #     & (pl.col("DontCountHits") == 0)
         # )
@@ -792,7 +786,7 @@
 
 result_json = {
     "system": "Polars (DataFrame)",
-    "date": datetime.date.today().strftime("%Y-%m-%d"),
+    "date": date.today().strftime("%Y-%m-%d"),
     "machine": "c6a.metal, 500gb gp2",
     "cluster_size": 1,
     "comment": "",

From 77c10574f58e14f5205abd8e5c7d32f161d6abfe Mon Sep 17 00:00:00 2001
From: ritchie <ritchie46@gmail.com>
Date: Mon, 25 Nov 2024 13:46:32 +0100
Subject: [PATCH 02/10] single query

---
 polars/query.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polars/query.py b/polars/query.py
index 269463acf..75eb3d8c6 100755
--- a/polars/query.py
+++ b/polars/query.py
@@ -39,7 +39,7 @@
         "Q2",
         "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;",
         lambda x: (x["AdvEngineID"].sum(), x.shape[0], x["ResolutionWidth"].mean()),
-        lambda x: (x.select(pl.col("advengineid").sum()).collect().item(), x.select(pl.len()).collect().item(), x.select(pl.col("advengineid").mean()).collect().item()),
+        lambda x: x.select(pl.col("advengineid").sum(), pl.len(), pl.col("advengineid").mean()).collect().rows()[0],
     ),
     (
         "Q3",

From 72d910b945a41a15d471b671e70106812ebddf27 Mon Sep 17 00:00:00 2001
From: ritchie <ritchie46@gmail.com>
Date: Mon, 25 Nov 2024 13:51:05 +0100
Subject: [PATCH 03/10] pass lazy

---
 polars/query.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polars/query.py b/polars/query.py
index 75eb3d8c6..9afbb5fa2 100755
--- a/polars/query.py
+++ b/polars/query.py
@@ -22,7 +22,7 @@
         hits[col] = hits[col].astype(str)
 
 start = timeit.default_timer()
-pl_df = pl.DataFrame(hits).rechunk()
+pl_df = pl.DataFrame(hits).rechunk().lazy()
 stop = timeit.default_timer()
 load_time = stop - start
 

From bd16ac32e11bdda674944610ec6e679254641676 Mon Sep 17 00:00:00 2001
From: ritchie <ritchie46@gmail.com>
Date: Tue, 26 Nov 2024 08:26:51 +0100
Subject: [PATCH 04/10] remove pandas and fix all queries

---
 polars/benchmark.sh |   2 +-
 polars/query.py     | 407 ++++++--------------------------------------
 2 files changed, 50 insertions(+), 359 deletions(-)

diff --git a/polars/benchmark.sh b/polars/benchmark.sh
index 16980aa40..10ad7e61f 100755
--- a/polars/benchmark.sh
+++ b/polars/benchmark.sh
@@ -4,7 +4,7 @@
 
 sudo apt-get update
 sudo apt-get install -y python3-pip
-pip install --break-system-packages pandas polars
+pip install --break-system-packages polars
 
 # Download the data
 wget --no-verbose --continue https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet
diff --git a/polars/query.py b/polars/query.py
index 9afbb5fa2..feb798f26 100755
--- a/polars/query.py
+++ b/polars/query.py
@@ -1,77 +1,62 @@
 #!/usr/bin/env python3
 
-import pandas as pd
 import polars as pl
 import timeit
 from datetime import datetime, date
 import json
 
-hits = pd.read_parquet("hits.parquet")
-
-dataframe_size = hits.memory_usage().sum()
+start = timeit.default_timer()
+df = pl.scan_parquet("hits.parquet").collect()
+stop = timeit.default_timer()
+load_time = stop - start
 
-# print("Dataframe(numpy) size:", dataframe_size, "bytes")
 
 # fix some types
-hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s")
-hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D")
-
-# fix all object columns to string
-for col in hits.columns:
-    if hits[col].dtype == "O":
-        hits[col] = hits[col].astype(str)
+df = df.with_columns(
+    (pl.col("EventTime") * 1000).cast(pl.Datetime(time_unit="ms")),
+    pl.col("EventDate").cast(pl.Date),
+)
+assert df["EventTime"][0].year == 2013
+df = df.rechunk()
 
-start = timeit.default_timer()
-pl_df = pl.DataFrame(hits).rechunk().lazy()
-stop = timeit.default_timer()
-load_time = stop - start
+lf = df.lazy()
 
-# 0: No., 1: SQL, 2: Pandas, 3: Polars
+# 0: No., 1: SQL, 2: Polars
 queries = [
-    ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.count(), lambda x: x.select(pl.len()).collect().height),
+    ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.select(pl.len()).collect().height),
     (
         "Q1",
         "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;",
-        lambda x: x[x["AdvEngineID"] != 0].count(),
         lambda x: x.select(pl.col("AdvEngineID").filter(pl.col("AdvEngineID") != 0).count()).collect().height,
     ),
     (
         "Q2",
         "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;",
-        lambda x: (x["AdvEngineID"].sum(), x.shape[0], x["ResolutionWidth"].mean()),
-        lambda x: x.select(pl.col("advengineid").sum(), pl.len(), pl.col("advengineid").mean()).collect().rows()[0],
+        lambda x: x.select(a_sum=pl.col("AdvEngineID").sum(), count=pl.len(), a_mean=pl.col("AdvEngineID").mean()).collect().rows()[0],
     ),
     (
         "Q3",
         "SELECT AVG(UserID) FROM hits;",
-        lambda x: x["UserID"].mean(),
         lambda x: x.select(pl.col("UserID").mean()).collect().item(),
     ),
     (
         "Q4",
         "SELECT COUNT(DISTINCT UserID) FROM hits;",
-        lambda x: x["UserID"].nunique(),
         lambda x: x.select(pl.col("UserID").n_unique()).collect().item(),
     ),
     (
         "Q5",
         "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;",
-        lambda x: x["SearchPhrase"].nunique(),
         lambda x: x.select(pl.col("SearchPhrase").n_unique()).collect().item(),
     ),
     (
         "Q6",
         "SELECT MIN(EventDate), MAX(EventDate) FROM hits;",
-        lambda x: (x["EventDate"].min(), x["EventDate"].max()),
-        lambda x: x.select(pl.col("EventDate").min().alias("e_min"), pl.col("EventDate").max("e_max")).collect().rows()[0]
+        lambda x: x.select(e_min=pl.col("EventDate").min(), e_max=pl.col("EventDate").max()).collect().rows()[0]
     ),
     (
         "Q7",
         "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;",
-        lambda x: x[x["AdvEngineID"] != 0]
-        .groupby("AdvEngineID")
-        .size()
-        .sort_values(ascending=False),
         lambda x: x.filter(pl.col("AdvEngineID") != 0)
         .group_by("AdvEngineID")
         .agg(pl.len().alias("count"))
@@ -80,7 +65,6 @@
     (
         "Q8",
         "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;",
-        lambda x: x.groupby("RegionID")["UserID"].nunique().nlargest(10),
         lambda x: x.group_by("RegionID")
         .agg(pl.col("UserID").n_unique())
         .sort("UserID", descending=True)
@@ -89,9 +73,6 @@
     (
         "Q9",
         "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;",
-        lambda x: x.groupby("RegionID")
-        .agg({"AdvEngineID": "sum", "ResolutionWidth": "mean", "UserID": "nunique"})
-        .nlargest(10, "AdvEngineID"),
         lambda x: x.group_by("RegionID")
         .agg(
             [
@@ -106,10 +87,6 @@
     (
         "Q10",
         "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;",
-        lambda x: x[x["MobilePhoneModel"] != ""]
-        .groupby("MobilePhoneModel")["UserID"]
-        .nunique()
-        .nlargest(10),
         lambda x: x.filter(pl.col("MobilePhoneModel") != "")
         .group_by("MobilePhoneModel")
         .agg(pl.col("UserID").n_unique())
@@ -119,10 +96,6 @@
     (
         "Q11",
         "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;",
-        lambda x: x[x["MobilePhoneModel"] != ""]
-        .groupby(["MobilePhone", "MobilePhoneModel"])["UserID"]
-        .nunique()
-        .nlargest(10),
         lambda x: x.filter(pl.col("MobilePhoneModel") != "")
         .group_by(["MobilePhone", "MobilePhoneModel"])
         .agg(pl.col("UserID").n_unique())
@@ -132,10 +105,6 @@
     (
         "Q12",
         "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;",
-        lambda x: x[x["SearchPhrase"] != ""]
-        .groupby("SearchPhrase")
-        .size()
-        .nlargest(10),
         lambda x: x.filter(pl.col("SearchPhrase") != "")
         .group_by("SearchPhrase")
         .agg(pl.len().alias("count"))
@@ -145,10 +114,6 @@
     (
         "Q13",
         "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;",
-        lambda x: x[x["SearchPhrase"] != ""]
-        .groupby("SearchPhrase")["UserID"]
-        .nunique()
-        .nlargest(10),
         lambda x: x.filter(pl.col("SearchPhrase") != "")
         .group_by("SearchPhrase")
         .agg(pl.col("UserID").n_unique())
@@ -158,10 +123,6 @@
     (
         "Q14",
         "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;",
-        lambda x: x[x["SearchPhrase"] != ""]
-        .groupby(["SearchEngineID", "SearchPhrase"])
-        .size()
-        .nlargest(10),
         lambda x: x.filter(pl.col("SearchPhrase") != "")
         .group_by(["SearchEngineID", "SearchPhrase"])
         .agg(pl.len().alias("count"))
@@ -171,7 +132,6 @@
     (
         "Q15",
         "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;",
-        lambda x: x.groupby("UserID").size().nlargest(10),
         lambda x: x.group_by("UserID")
         .agg(pl.len().alias("count"))
         .sort("count", descending=True)
@@ -180,7 +140,6 @@
     (
         "Q16",
         "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;",
-        lambda x: x.groupby(["UserID", "SearchPhrase"]).size().nlargest(10),
         lambda x: x.group_by(["UserID", "SearchPhrase"])
         .agg(pl.len().alias("count"))
         .sort("count", descending=True)
@@ -189,17 +148,13 @@
     (
         "Q17",
         "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;",
-        lambda x: x.groupby(["UserID", "SearchPhrase"]).size().head(10),
         lambda x: x.group_by(["UserID", "SearchPhrase"]).agg(pl.len()).head(10).collect(),
     ),
     (
         "Q18",
         "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;",
-        lambda x: x.groupby([x["UserID"], x["EventTime"].dt.minute, "SearchPhrase"])
-        .size()
-        .nlargest(10),
         lambda x: x.group_by(
-            [pl.col("UserID"), x["EventTime"].dt.minute(), "SearchPhrase"]
+            [pl.col("UserID"), pl.col("EventTime").dt.minute(), "SearchPhrase"]
         )
         .agg(pl.len().alias("count"))
         .sort("count", descending=True)
@@ -208,22 +163,16 @@
     (
         "Q19",
         "SELECT UserID FROM hits WHERE UserID = 435090932899640449;",
-        lambda x: x[x["UserID"] == 435090932899640449],
         lambda x: x.select("UserID").filter(pl.col("UserID") == 435090932899640449).collect(),
     ),
     (
         "Q20",
         "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';",
-        lambda x: x[x["URL"].str.contains("google")].shape[0],
         lambda x: x.filter(pl.col("URL").str.contains("google")).select(pl.len()).collect().item(),
     ),
     (
         "Q21",
         "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;",
-        lambda x: x[(x["URL"].str.contains("google")) & (x["SearchPhrase"] != "")]
-        .groupby("SearchPhrase")
-        .agg({"URL": "min", "SearchPhrase": "size"})
-        .nlargest(10, "SearchPhrase"),
         lambda x: x.filter(
             (pl.col("URL").str.contains("google")) & (pl.col("SearchPhrase") != "")
         )
@@ -235,16 +184,6 @@
     (
         "Q22",
         "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;",
-        lambda x: x[
-            (x["Title"].str.contains("Google"))
-            & (~x["URL"].str.contains(".google."))
-            & (x["SearchPhrase"] != "")
-        ]
-        .groupby("SearchPhrase")
-        .agg(
-            {"URL": "min", "Title": "min", "SearchPhrase": "size", "UserID": "nunique"}
-        )
-        .nlargest(10, "SearchPhrase"),
         lambda x: x.filter(
             (pl.col("Title").str.contains("Google"))
             & (~pl.col("URL").str.contains(".google."))
@@ -265,9 +204,6 @@
     (
         "Q23",
         "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;",
-        lambda x: x[x["URL"].str.contains("google")]
-        .sort_values(by="EventTime")
-        .head(10),
         lambda x: x.filter(pl.col("URL").str.contains("google"))
         .sort("EventTime")
         .head(10).collect(),
@@ -275,9 +211,6 @@
     (
         "Q24",
         "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;",
-        lambda x: x[x["SearchPhrase"] != ""]
-        .sort_values(by="EventTime")[["SearchPhrase"]]
-        .head(10),
         lambda x: x.filter(pl.col("SearchPhrase") != "")
         .sort("EventTime")
         .select("SearchPhrase")
@@ -286,9 +219,6 @@
     (
         "Q25",
         "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;",
-        lambda x: x[x["SearchPhrase"] != ""]
-        .sort_values(by="SearchPhrase")[["SearchPhrase"]]
-        .head(10),
         lambda x: x.filter(pl.col("SearchPhrase") != "")
         .sort("SearchPhrase")
         .select("SearchPhrase")
@@ -297,9 +227,6 @@
     (
         "Q26",
         "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;",
-        lambda x: x[x["SearchPhrase"] != ""]
-        .sort_values(by=["EventTime", "SearchPhrase"])[["SearchPhrase"]]
-        .head(10),
         lambda x: x.filter(pl.col("SearchPhrase") != "")
         .sort(["EventTime", "SearchPhrase"])
         .select("SearchPhrase")
@@ -308,12 +235,6 @@
     (
         "Q27",
         "SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;",
-        lambda x: x[x["URL"] != ""]
-        .groupby("CounterID")
-        .filter(lambda g: g["URL"].count() > 100000)
-        .agg({"URL": lambda url: url.str.len().mean(), "CounterID": "size"})
-        .sort_values()
-        .head(25),
         lambda x: x.filter(pl.col("URL") != "")  # WHERE URL <> ''
         .group_by("CounterID")  # GROUP BY CounterID
         .agg(
@@ -329,17 +250,6 @@
     (
         "Q28",
         "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;",
-        lambda x: (
-            x[x["Referer"] != ""]
-            .assign(k=x["Referer"].str.extract(r"^https?://(?:www\.)?([^/]+)/.*$")[0])
-            .groupby("k")
-            .filter(lambda g: g["Referer"].count() > 100000)
-            .agg(
-                min_referer=("Referer", "min"),
-                average_length=("Referer", lambda r: r.str.len().mean()),
-            )
-            .head(25)
-        ),
         lambda x: (
             x.filter(pl.col("Referer") != "")
             .with_columns(
@@ -363,109 +273,11 @@
     (
         "Q29",
         "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 
73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;",
-        lambda x: x["ResolutionWidth"].sum()
-        + x["ResolutionWidth"].shift(1).sum()
-        + x["ResolutionWidth"].shift(2).sum()
-        + x["ResolutionWidth"].shift(3).sum()
-        + x["ResolutionWidth"].shift(4).sum()
-        + x["ResolutionWidth"].shift(5).sum()
-        + x["ResolutionWidth"].shift(6).sum()
-        + x["ResolutionWidth"].shift(7).sum()
-        + x["ResolutionWidth"].shift(8).sum()
-        + x["ResolutionWidth"].shift(9).sum()
-        + x["ResolutionWidth"].shift(10).sum()
-        + x["ResolutionWidth"].shift(11).sum()
-        + x["ResolutionWidth"].shift(12).sum()
-        + x["ResolutionWidth"].shift(13).sum()
-        + x["ResolutionWidth"].shift(14).sum()
-        + x["ResolutionWidth"].shift(15).sum()
-        + x["ResolutionWidth"].shift(16).sum()
-        + x["ResolutionWidth"].shift(17).sum()
-        + x["ResolutionWidth"].shift(18).sum()
-        + x["ResolutionWidth"].shift(19).sum()
-        + x["ResolutionWidth"].shift(20).sum()
-        + x["ResolutionWidth"].shift(21).sum()
-        + x["ResolutionWidth"].shift(22).sum()
-        + x["ResolutionWidth"].shift(23).sum()
-        + x["ResolutionWidth"].shift(24).sum()
-        + x["ResolutionWidth"].shift(25).sum()
-        + x["ResolutionWidth"].shift(26).sum()
-        + x["ResolutionWidth"].shift(27).sum()
-        + x["ResolutionWidth"].shift(28).sum()
-        + x["ResolutionWidth"].shift(29).sum()
-        + x["ResolutionWidth"].shift(30).sum()
-        + x["ResolutionWidth"].shift(31).sum()
-        + x["ResolutionWidth"].shift(32).sum()
-        + x["ResolutionWidth"].shift(33).sum()
-        + x["ResolutionWidth"].shift(34).sum()
-        + x["ResolutionWidth"].shift(35).sum()
-        + x["ResolutionWidth"].shift(36).sum()
-        + x["ResolutionWidth"].shift(37).sum()
-        + x["ResolutionWidth"].shift(38).sum()
-        + x["ResolutionWidth"].shift(39).sum()
-        + x["ResolutionWidth"].shift(40).sum()
-        + x["ResolutionWidth"].shift(41).sum()
-        + x["ResolutionWidth"].shift(42).sum()
-        + x["ResolutionWidth"].shift(43).sum()
-        + x["ResolutionWidth"].shift(44).sum()
-        + x["ResolutionWidth"].shift(45).sum()
-        + x["ResolutionWidth"].shift(46).sum()
-        + x["ResolutionWidth"].shift(47).sum()
-        + x["ResolutionWidth"].shift(48).sum()
-        + x["ResolutionWidth"].shift(49).sum()
-        + x["ResolutionWidth"].shift(50).sum()
-        + x["ResolutionWidth"].shift(51).sum()
-        + x["ResolutionWidth"].shift(52).sum()
-        + x["ResolutionWidth"].shift(53).sum()
-        + x["ResolutionWidth"].shift(54).sum()
-        + x["ResolutionWidth"].shift(55).sum()
-        + x["ResolutionWidth"].shift(56).sum()
-        + x["ResolutionWidth"].shift(57).sum()
-        + x["ResolutionWidth"].shift(58).sum()
-        + x["ResolutionWidth"].shift(59).sum()
-        + x["ResolutionWidth"].shift(60).sum()
-        + x["ResolutionWidth"].shift(61).sum()
-        + x["ResolutionWidth"].shift(62).sum()
-        + x["ResolutionWidth"].shift(63).sum()
-        + x["ResolutionWidth"].shift(64).sum()
-        + x["ResolutionWidth"].shift(65).sum()
-        + x["ResolutionWidth"].shift(66).sum()
-        + x["ResolutionWidth"].shift(67).sum()
-        + x["ResolutionWidth"].shift(68).sum()
-        + x["ResolutionWidth"].shift(69).sum()
-        + x["ResolutionWidth"].shift(70).sum()
-        + x["ResolutionWidth"].shift(71).sum()
-        + x["ResolutionWidth"].shift(72).sum()
-        + x["ResolutionWidth"].shift(73).sum()
-        + x["ResolutionWidth"].shift(74).sum()
-        + x["ResolutionWidth"].shift(75).sum()
-        + x["ResolutionWidth"].shift(76).sum()
-        + x["ResolutionWidth"].shift(77).sum()
-        + x["ResolutionWidth"].shift(78).sum()
-        + x["ResolutionWidth"].shift(79).sum()
-        + x["ResolutionWidth"].shift(80).sum()
-        + x["ResolutionWidth"].shift(81).sum()
-        + x["ResolutionWidth"].shift(82).sum()
-        + x["ResolutionWidth"].shift(83).sum()
-        + x["ResolutionWidth"].shift(84).sum()
-        + x["ResolutionWidth"].shift(85).sum()
-        + x["ResolutionWidth"].shift(86).sum()
-        + x["ResolutionWidth"].shift(87).sum()
-        + x["ResolutionWidth"].shift(88).sum()
-        + x["ResolutionWidth"].shift(89).sum(),
         lambda x: x.select(pl.sum_horizontal([pl.col("ResolutionWidth").shift(i) for i in range(1, 90)])).collect(),
     ),
     (
         "Q30",
         "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;",
-        lambda x: x[x["SearchPhrase"] != ""]
-        .groupby(["SearchEngineID", "ClientIP"])
-        .agg(
-            c=("SearchEngineID", "size"),
-            IsRefreshSum=("IsRefresh", "sum"),
-            AvgResolutionWidth=("ResolutionWidth", "mean"),
-        )
-        .nlargest(10, "c"),
         lambda x: x.filter(pl.col("SearchPhrase") != "")
         .group_by(["SearchEngineID", "ClientIP"])
         .agg(
@@ -481,14 +293,6 @@
     (
         "Q31",
         "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;",
-        lambda x: x[x["SearchPhrase"] != ""]
-        .groupby(["WatchID", "ClientIP"])
-        .agg(
-            c=("WatchID", "size"),
-            IsRefreshSum=("IsRefresh", "sum"),
-            AvgResolutionWidth=("ResolutionWidth", "mean"),
-        )
-        .nlargest(10, "c"),
         lambda x: x.filter(pl.col("SearchPhrase") != "")
         .group_by(["WatchID", "ClientIP"])
         .agg(
@@ -504,13 +308,6 @@
     (
         "Q32",
         "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;",
-        lambda x: x.groupby(["WatchID", "ClientIP"])
-        .agg(
-            c=("WatchID", "size"),
-            IsRefreshSum=("IsRefresh", "sum"),
-            AvgResolutionWidth=("ResolutionWidth", "mean"),
-        )
-        .nlargest(10, "c"),
         lambda x: x.group_by(["WatchID", "ClientIP"])
         .agg(
             [
@@ -525,7 +322,6 @@
     (
         "Q33",
         "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;",
-        lambda x: x.groupby("URL").size().nlargest(10).reset_index(name="c"),
         lambda x: x.group_by("URL")
         .agg(pl.len().alias("c"))
         .sort("c", descending=True)
@@ -534,7 +330,6 @@
     (
         "Q34",
         "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;",
-        lambda x: x.groupby(["URL"]).size().nlargest(10).reset_index(name="c"),
         lambda x: x.group_by("URL")
         .agg(pl.len().alias("c"))
         .sort("c", descending=True)
@@ -543,15 +338,6 @@
     (
         "Q35",
         "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;",
-        lambda x: x.assign(
-            **{f"ClientIP_minus_{i}": x["ClientIP"] - i for i in range(1, 4)}
-        )
-        .groupby(
-            ["ClientIP", "ClientIP_minus_1", "ClientIP_minus_2", "ClientIP_minus_3"]
-        )
-        .size()
-        .nlargest(10)
-        .reset_index(name="c"),
         lambda x: x.with_columns([pl.col("ClientIP")])
         .group_by(["ClientIP"])
         .agg(pl.len().alias("c"))
@@ -561,17 +347,6 @@
     (
         "Q36",
         "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;",
-        lambda x: x[
-            (x["CounterID"] == 62)
-            & (x["EventDate"] >= "2013-07-01")
-            & (x["EventDate"] <= "2013-07-31")
-            & (x["DontCountHits"] == 0)
-            & (x["IsRefresh"] == 0)
-            & (x["URL"] != "")
-        ]
-        .groupby("URL")
-        .size()
-        .nlargest(10),
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
             & (pl.col("EventDate") >= datetime(2013, 7, 1))
@@ -588,17 +363,6 @@
     (
         "Q37",
         "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;",
-        lambda x: x[
-            (x["CounterID"] == 62)
-            & (x["EventDate"] >= "2013-07-01")
-            & (x["EventDate"] <= "2013-07-31")
-            & (x["DontCountHits"] == 0)
-            & (x["IsRefresh"] == 0)
-            & (x["Title"] != "")
-        ]
-        .groupby("Title")
-        .size()
-        .nlargest(10),
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
             & (pl.col("EventDate") >= datetime(2013, 7, 1))
@@ -615,19 +379,6 @@
     (
         "Q38",
         "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;",
-        lambda x: x[
-            (x["CounterID"] == 62)
-            & (x["EventDate"] >= "2013-07-01")
-            & (x["EventDate"] <= "2013-07-31")
-            & (x["IsRefresh"] == 0)
-            & (x["IsLink"] != 0)
-            & (x["IsDownload"] == 0)
-        ]
-        .groupby("URL")
-        .size()
-        .nlargest(10)
-        .reset_index(name="PageViews")
-        .iloc[1000:1010],
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
             & (pl.col("EventDate") >= datetime(2013, 7, 1))
@@ -644,60 +395,31 @@
     (
         "Q39",
         "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;",
-        lambda x: x[
-            (x["CounterID"] == 62)
-            & (x["EventDate"] >= "2013-07-01")
-            & (x["EventDate"] <= "2013-07-31")
-            & (x["IsRefresh"] == 0)
-        ]
-        .groupby(["TraficSourceID", "SearchEngineID", "AdvEngineID", "Referer", "URL"])
-        .size()
-        .nlargest(10)
-        .reset_index(name="PageViews")
-        .iloc[1000:1010],
-        lambda x: None,
-        # Crash with:
-        #   thread '<unnamed>' panicked at crates/polars-time/src/windows/duration.rs:215:21:
-        #   expected leading integer in the duration string, found m
-        #   note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
-        # lambda x: x.filter(
-        #     (pl.col("CounterID") == 62)
-        #     & (pl.col("EventDate") >= datetime(2013, 7, 1))
-        #     & (pl.col("EventDate") <= datetime(2013, 7, 31))
-        #     & (pl.col("IsRefresh") == 0)
-        # )
-        # .group_by(
-        #     [
-        #         "TraficSourceID",
-        #         "SearchEngineID",
-        #         "AdvEngineID",
-        #         # pl.when(pl.col("SearchEngineID").eq(0) & pl.col("AdvEngineID").eq(0))
-        #         # .then(pl.col("Referer"))
-        #         # .otherwise("")
-        #         # .alias("Src"),
-        #         "URL",
-        #     ]
-        # )
-        # .agg(pl.len().alias("PageViews"))
-        # .sort("PageViews", descending=True)
-        # .slice(1000, 10),
+        lambda x: x.filter(
+            (pl.col("CounterID") == 62)
+            & (pl.col("EventDate") >= datetime(2013, 7, 1))
+            & (pl.col("EventDate") <= datetime(2013, 7, 31))
+            & (pl.col("IsRefresh") == 0)
+        )
+        .group_by(
+            [
+                "TraficSourceID",
+                "SearchEngineID",
+                "AdvEngineID",
+                pl.when(pl.col("SearchEngineID").eq(0) & pl.col("AdvEngineID").eq(0))
+                .then(pl.col("Referer"))
+                .otherwise("")
+                .alias("Src"),
+                "URL",
+            ]
+        )
+        .agg(pl.len().alias("PageViews"))
+        .sort("PageViews", descending=True)
+        .slice(1000, 10),
     ),
     (
         "Q40",
         "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;",
-        lambda x: x[
-            (x["CounterID"] == 62)
-            & (x["EventDate"] >= "2013-07-01")
-            & (x["EventDate"] <= "2013-07-31")
-            & (x["IsRefresh"] == 0)
-            & (x["TraficSourceID"].isin([-1, 6]))
-            & (x["RefererHash"] == 3594120000172545465)
-        ]
-        .groupby(["URLHash", "EventDate"])
-        .size()
-        .nlargest(10)
-        .reset_index(name="PageViews")
-        .iloc[100:110],
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
             & (pl.col("EventDate") >= datetime(2013, 7, 1))
@@ -714,19 +436,6 @@
     (
         "Q41",
         "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;",
-        lambda x: x[
-            (x["CounterID"] == 62)
-            & (x["EventDate"] >= "2013-07-01")
-            & (x["EventDate"] <= "2013-07-31")
-            & (x["IsRefresh"] == 0)
-            & (x["DontCountHits"] == 0)
-            & (x["URLHash"] == 2868770270353813622)
-        ]
-        .groupby(["WindowClientWidth", "WindowClientHeight"])
-        .size()
-        .nlargest(10)
-        .reset_index(name="PageViews")
-        .iloc[10000:10010],
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
             & (pl.col("EventDate") >= datetime(2013, 7, 1))
@@ -743,31 +452,17 @@
     (
         "Q42",
         "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;",
-        lambda x: x[
-            (x["CounterID"] == 62)
-            & (x["EventDate"] >= "2013-07-14")
-            & (x["EventDate"] <= "2013-07-15")
-            & (x["IsRefresh"] == 0)
-            & (x["DontCountHits"] == 0)
-        ]
-        .groupby(pd.Grouper(key="EventTime", freq="T"))
-        .size()
-        .reset_index(name="PageViews")
-        .iloc[1000:1010],
         lambda x: None,
-        # Crash with:
-        #   thread '<unnamed>' panicked at crates/polars-time/src/windows/duration.rs:215:21:
-        #   expected leading integer in the duration string, found m
-        # lambda x: x.filter(
-        #     (pl.col("CounterID") == 62)
-        #     & (pl.col("EventDate") >= datetime(2013, 7, 14))
-        #     & (pl.col("EventDate") <= datetime(2013, 7, 15))
-        #     & (pl.col("IsRefresh") == 0)
-        #     & (pl.col("DontCountHits") == 0)
-        # )
-        # .group_by(pl.col("EventTime").dt.truncate("minute"))
-        # .agg(pl.len().alias("PageViews"))
-        # .slice(1000, 10),
+        lambda x: x.filter(
+            (pl.col("CounterID") == 62)
+            & (pl.col("EventDate") >= datetime(2013, 7, 14))
+            & (pl.col("EventDate") <= datetime(2013, 7, 15))
+            & (pl.col("IsRefresh") == 0)
+            & (pl.col("DontCountHits") == 0)
+        )
+        .group_by(pl.col("EventTime").dt.truncate("minute"))
+        .agg(pl.len().alias("PageViews"))
+        .slice(1000, 10),
     ),
 ]
 
@@ -776,7 +471,7 @@
     times = []
     for _ in range(3):
         start = timeit.default_timer()
-        result = q[3](pl_df)
+        result = q[2](lf)
         end = timeit.default_timer()
         if result is None:
             times.append(None)
@@ -791,15 +486,11 @@
     "cluster_size": 1,
     "comment": "",
     "tags": [
-        "C++",
         "column-oriented",
-        "embedded",
-        "stateless",
-        "serverless",
         "dataframe",
     ],
     "load_time": float(load_time),
-    "data_size": int(dataframe_size),
+    "data_size": int(lf.collect().estimated_size()),
     "result": queries_times,
 }
 

From 5b262bf9371a670fe4e8112655e147add79c93ac Mon Sep 17 00:00:00 2001
From: ritchie <ritchie46@gmail.com>
Date: Tue, 26 Nov 2024 08:32:08 +0100
Subject: [PATCH 05/10] always update to latest

---
 polars/benchmark.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polars/benchmark.sh b/polars/benchmark.sh
index 10ad7e61f..1cda7badb 100755
--- a/polars/benchmark.sh
+++ b/polars/benchmark.sh
@@ -4,7 +4,7 @@
 
 sudo apt-get update
 sudo apt-get install -y python3-pip
-pip install --break-system-packages polars
+pip install -U --break-system-packages polars
 
 # Download the data
 wget --no-verbose --continue https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet

From d12bfb69524b48ab210122140d7242aae5941cb7 Mon Sep 17 00:00:00 2001
From: ritchie <ritchie46@gmail.com>
Date: Tue, 26 Nov 2024 08:41:17 +0100
Subject: [PATCH 06/10] fix all queries

---
 polars/query.py | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/polars/query.py b/polars/query.py
index feb798f26..ad9ed30ba 100755
--- a/polars/query.py
+++ b/polars/query.py
@@ -2,7 +2,7 @@
 
 import polars as pl
 import timeit
-from datetime import datetime, date
+from datetime import date
 import json
 
 start = timeit.default_timer()
@@ -13,7 +13,7 @@
 
 # fix some types
 df = df.with_columns(
-    (pl.col("EventTime") * 1000).cast(pl.Datetime(time_unit="ms")),
+    (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")),
     pl.col("EventDate").cast(pl.Date),
 )
 assert df["EventTime"][0].year == 2013
@@ -349,8 +349,8 @@
         "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;",
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
-            & (pl.col("EventDate") >= datetime(2013, 7, 1))
-            & (pl.col("EventDate") <= datetime(2013, 7, 31))
+            & (pl.col("EventDate") >= date(2013, 7, 1))
+            & (pl.col("EventDate") <= date(2013, 7, 31))
             & (pl.col("DontCountHits") == 0)
             & (pl.col("IsRefresh") == 0)
             & (pl.col("URL") != "")
@@ -365,8 +365,8 @@
         "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;",
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
-            & (pl.col("EventDate") >= datetime(2013, 7, 1))
-            & (pl.col("EventDate") <= datetime(2013, 7, 31))
+            & (pl.col("EventDate") >= date(2013, 7, 1))
+            & (pl.col("EventDate") <= date(2013, 7, 31))
             & (pl.col("DontCountHits") == 0)
             & (pl.col("IsRefresh") == 0)
             & (pl.col("Title") != "")
@@ -381,8 +381,8 @@
         "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;",
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
-            & (pl.col("EventDate") >= datetime(2013, 7, 1))
-            & (pl.col("EventDate") <= datetime(2013, 7, 31))
+            & (pl.col("EventDate") >= date(2013, 7, 1))
+            & (pl.col("EventDate") <= date(2013, 7, 31))
             & (pl.col("IsRefresh") == 0)
             & (pl.col("IsLink") != 0)
             & (pl.col("IsDownload") == 0)
@@ -397,8 +397,8 @@
         "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;",
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
-            & (pl.col("EventDate") >= datetime(2013, 7, 1))
-            & (pl.col("EventDate") <= datetime(2013, 7, 31))
+            & (pl.col("EventDate") >= date(2013, 7, 1))
+            & (pl.col("EventDate") <= date(2013, 7, 31))
             & (pl.col("IsRefresh") == 0)
         )
         .group_by(
@@ -408,22 +408,22 @@
                 "AdvEngineID",
                 pl.when(pl.col("SearchEngineID").eq(0) & pl.col("AdvEngineID").eq(0))
                 .then(pl.col("Referer"))
-                .otherwise("")
+                .otherwise(pl.lit(""))
                 .alias("Src"),
                 "URL",
             ]
         )
         .agg(pl.len().alias("PageViews"))
         .sort("PageViews", descending=True)
-        .slice(1000, 10),
+        .slice(1000, 10).collect(),
     ),
     (
         "Q40",
         "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;",
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
-            & (pl.col("EventDate") >= datetime(2013, 7, 1))
-            & (pl.col("EventDate") <= datetime(2013, 7, 31))
+            & (pl.col("EventDate") >= date(2013, 7, 1))
+            & (pl.col("EventDate") <= date(2013, 7, 31))
             & (pl.col("IsRefresh") == 0)
             & (pl.col("TraficSourceID").is_in([-1, 6]))
             & (pl.col("RefererHash") == 3594120000172545465)
@@ -438,8 +438,8 @@
         "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;",
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
-            & (pl.col("EventDate") >= datetime(2013, 7, 1))
-            & (pl.col("EventDate") <= datetime(2013, 7, 31))
+            & (pl.col("EventDate") >= date(2013, 7, 1))
+            & (pl.col("EventDate") <= date(2013, 7, 31))
             & (pl.col("IsRefresh") == 0)
             & (pl.col("DontCountHits") == 0)
             & (pl.col("URLHash") == 2868770270353813622)
@@ -455,19 +455,20 @@
         lambda x: None,
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
-            & (pl.col("EventDate") >= datetime(2013, 7, 14))
-            & (pl.col("EventDate") <= datetime(2013, 7, 15))
+            & (pl.col("EventDate") >= date(2013, 7, 14))
+            & (pl.col("EventDate") <= date(2013, 7, 15))
             & (pl.col("IsRefresh") == 0)
             & (pl.col("DontCountHits") == 0)
         )
         .group_by(pl.col("EventTime").dt.truncate("minute"))
         .agg(pl.len().alias("PageViews"))
-        .slice(1000, 10),
+        .slice(1000, 10).collect(),
     ),
 ]
 
 queries_times = []
 for q in queries:
+    print(q[0])
     times = []
     for _ in range(3):
         start = timeit.default_timer()

From f0d18911943a4da82c7fe0a2d73fa1292e98bfc8 Mon Sep 17 00:00:00 2001
From: ritchie <ritchie46@gmail.com>
Date: Tue, 26 Nov 2024 08:48:55 +0100
Subject: [PATCH 07/10] don't do unicode regex

---
 polars/query.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/polars/query.py b/polars/query.py
index ad9ed30ba..b92ed215a 100755
--- a/polars/query.py
+++ b/polars/query.py
@@ -6,7 +6,8 @@
 import json
 
 start = timeit.default_timer()
-df = pl.scan_parquet("hits.parquet").collect()
+# df = pl.scan_parquet("hits.parquet").collect()
+df = pl.read_parquet("hits.parquet", n_rows=int(1e7))
 stop = timeit.default_timer()
 load_time = stop - start
 
@@ -249,7 +250,7 @@
     ),
     (
         "Q28",
-        "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;",
+        "SELECT REGEXP_REPLACE(Referer, '(?-u)^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;",
         lambda x: (
             x.filter(pl.col("Referer") != "")
             .with_columns(

From c4613429ec1c6ee43f1643ce8404da731e0b9bf5 Mon Sep 17 00:00:00 2001
From: ritchie <ritchie46@gmail.com>
Date: Tue, 26 Nov 2024 09:02:59 +0100
Subject: [PATCH 08/10] undo slice

---
 polars/query.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/polars/query.py b/polars/query.py
index b92ed215a..82a97aa01 100755
--- a/polars/query.py
+++ b/polars/query.py
@@ -6,8 +6,7 @@
 import json
 
 start = timeit.default_timer()
-# df = pl.scan_parquet("hits.parquet").collect()
-df = pl.read_parquet("hits.parquet", n_rows=int(1e7))
+df = pl.scan_parquet("hits.parquet").collect()
 stop = timeit.default_timer()
 load_time = stop - start
 

From 5b510928e89d1944806e973c4874b28c84b28cad Mon Sep 17 00:00:00 2001
From: ritchie <ritchie46@gmail.com>
Date: Thu, 28 Nov 2024 15:26:46 +0100
Subject: [PATCH 09/10] add parquet source

---
 polars/query.py | 120 ++++++++++++++++++++++++++----------------------
 1 file changed, 65 insertions(+), 55 deletions(-)

diff --git a/polars/query.py b/polars/query.py
index 82a97aa01..347c6e662 100755
--- a/polars/query.py
+++ b/polars/query.py
@@ -5,23 +5,7 @@
 from datetime import date
 import json
 
-start = timeit.default_timer()
-df = pl.scan_parquet("hits.parquet").collect()
-stop = timeit.default_timer()
-load_time = stop - start
-
-
-# fix some types
-df = df.with_columns(
-    (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")),
-    pl.col("EventDate").cast(pl.Date),
-)
-assert df["EventTime"][0].year == 2013
-df = df.rechunk()
-
-lf = df.lazy()
-
-# 0: No., 1: SQL, 3: Polars
+# 0: No., 1: SQL, 2: Polars
 queries = [
     ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.select(pl.len()).collect().height),
     (
@@ -452,7 +436,6 @@
     (
         "Q42",
         "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;",
-        lambda x: None,
         lambda x: x.filter(
             (pl.col("CounterID") == 62)
             & (pl.col("EventDate") >= date(2013, 7, 14))
@@ -460,47 +443,74 @@
             & (pl.col("IsRefresh") == 0)
             & (pl.col("DontCountHits") == 0)
         )
-        .group_by(pl.col("EventTime").dt.truncate("minute"))
+        .group_by(pl.col("EventTime").dt.truncate("1m"))
         .agg(pl.len().alias("PageViews"))
         .slice(1000, 10).collect(),
     ),
 ]
 
-queries_times = []
-for q in queries:
-    print(q[0])
-    times = []
-    for _ in range(3):
-        start = timeit.default_timer()
-        result = q[2](lf)
-        end = timeit.default_timer()
-        if result is None:
-            times.append(None)
-        else:
-            times.append(end - start)
-    queries_times.append(times)
 
-result_json = {
-    "system": "Polars (DataFrame)",
-    "date": date.today().strftime("%Y-%m-%d"),
-    "machine": "c6a.metal, 500gb gp2",
-    "cluster_size": 1,
-    "comment": "",
-    "tags": [
-        "column-oriented",
-        "dataframe",
-    ],
-    "load_time": float(load_time),
-    "data_size": int(lf.collect().estimated_size()),
-    "result": queries_times,
-}
+def run_timings(lf: pl.LazyFrame, name: str, src: str, load_time: int | None) -> None:
+    queries_times = []
+    for q in queries:
+        print(q[0])
+        times = []
+        for _ in range(3):
+            start = timeit.default_timer()
+            result = q[2](lf)
+            end = timeit.default_timer()
+            if result is None:
+                times.append(None)
+            else:
+                times.append(end - start)
+        queries_times.append(times)
+
+    result_json = {
+        "system": name,
+        "date": date.today().strftime("%Y-%m-%d"),
+        "machine": "c6a.metal, 500gb gp2",
+        "cluster_size": 1,
+        "comment": "",
+        "tags": [
+            "column-oriented",
+            src,
+        ],
+        "load_time": float(load_time) if load_time is not None else None,
+        "result": queries_times,
+    }
+    # if cpuinfo contains "AMD EPYC 9654" update machine and write result into results/epyc-9654.json
+    if "AMD EPYC 9654" in open("/proc/cpuinfo").read():
+        result_json["machine"] = "EPYC 9654, 384G"
+        with open(f"results/{src}_epyc-9654.json", "w") as f:
+            f.write(json.dumps(result_json, indent=4))
+    else:
+        # write result into results/c6a.metal.json
+        with open(f"results/{src}_c6a.metal.json", "w") as f:
+            f.write(json.dumps(result_json, indent=4))
 
-# if cpuinfo contains "AMD EPYC 9654" update machine and write result into results/epyc-9654.json
-if "AMD EPYC 9654" in open("/proc/cpuinfo").read():
-    result_json["machine"] = "EPYC 9654, 384G"
-    with open("results/epyc-9654.json", "w") as f:
-        f.write(json.dumps(result_json, indent=4))
-else:
-    # write result into results/c6a.metal.json
-    with open("results/c6a.metal.json", "w") as f:
-        f.write(json.dumps(result_json, indent=4))
+
+# Run from Parquet
+lf = pl.scan_parquet("hits.parquet").with_columns(
+    (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")),
+    pl.col("EventDate").cast(pl.Date),
+)
+print("run parquet queries")
+run_timings(lf, "Polars (Parquet)", "parquet", None)
+
+
+print("run DataFrame (in-memory) queries, this loads all data in memory!")
+start = timeit.default_timer()
+df = pl.scan_parquet("hits.parquet").collect()
+stop = timeit.default_timer()
+load_time = stop - start
+
+# fix some types
+df = df.with_columns(
+    (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")),
+    pl.col("EventDate").cast(pl.Date),
+)
+assert df["EventTime"][0].year == 2013
+df = df.rechunk()
+
+lf = df.lazy()
+run_timings(lf, "Polars (DataFrame)", "DataFrame", load_time)

From 1d744275f035f9b4d2e8b0d98f77e472f438bb71 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Fri, 29 Nov 2024 11:41:40 +0000
Subject: [PATCH 10/10] Add results for c6a.metal

---
 polars/results/DataFrame_c6a.metal.json | 229 +++++++++++++++++++++++
 polars/results/c6a.metal.json           | 234 ------------------------
 polars/results/parquet_c6a.metal.json   | 229 +++++++++++++++++++++++
 3 files changed, 458 insertions(+), 234 deletions(-)
 create mode 100644 polars/results/DataFrame_c6a.metal.json
 delete mode 100644 polars/results/c6a.metal.json
 create mode 100644 polars/results/parquet_c6a.metal.json

diff --git a/polars/results/DataFrame_c6a.metal.json b/polars/results/DataFrame_c6a.metal.json
new file mode 100644
index 000000000..f214f8d26
--- /dev/null
+++ b/polars/results/DataFrame_c6a.metal.json
@@ -0,0 +1,229 @@
+{
+    "system": "Polars (DataFrame)",
+    "date": "2024-11-29",
+    "machine": "c6a.metal, 500gb gp2",
+    "cluster_size": 1,
+    "comment": "",
+    "tags": [
+        "column-oriented",
+        "DataFrame"
+    ],
+    "load_time": 2.3000679460001265,
+    "result": [
+        [
+            0.00015112200003386533,
+            3.7150000025576446e-05,
+            2.554100001361803e-05
+        ],
+        [
+            0.03171516099996552,
+            0.032293901000002734,
+            0.03215773800002353
+        ],
+        [
+            0.4772652639999251,
+            0.46750544699989405,
+            0.4495371789998899
+        ],
+        [
+            0.0710227990000476,
+            0.07084154700010004,
+            0.0707792650000556
+        ],
+        [
+            2.057352185000127,
+            1.7546763299999384,
+            1.163346606999994
+        ],
+        [
+            3.3611794670000563,
+            3.361934698999903,
+            3.356456425000033
+        ],
+        [
+            0.022322305999978198,
+            0.017734573999860004,
+            0.017439570000078675
+        ],
+        [
+            0.034810428999890064,
+            0.03980527599992456,
+            0.037032382999996116
+        ],
+        [
+            1.1927395830000478,
+            0.9333954369999446,
+            0.986558266999964
+        ],
+        [
+            1.1237861500001145,
+            1.0634325710000212,
+            1.2359178669998983
+        ],
+        [
+            0.4698280170000544,
+            0.4857669729999543,
+            0.46914764600001035
+        ],
+        [
+            0.4963598159999947,
+            0.5227434319999702,
+            0.42942682499983675
+        ],
+        [
+            0.6119570860000749,
+            0.6307538849998764,
+            0.6839730260001033
+        ],
+        [
+            0.6959008190001441,
+            0.7254145639999479,
+            0.766673179999998
+        ],
+        [
+            0.750602372000003,
+            0.7139416769998661,
+            0.6843765409998923
+        ],
+        [
+            0.8596073270000488,
+            0.9934823169999163,
+            0.9173948990001008
+        ],
+        [
+            2.7692482420000033,
+            2.3495000699999764,
+            2.2436599499999375
+        ],
+        [
+            2.026880264000056,
+            1.9250255050001215,
+            1.9470105970001441
+        ],
+        [
+            4.853015905999882,
+            4.480639889000031,
+            4.586371847999999
+        ],
+        [
+            0.065807199000119,
+            0.06525409100004254,
+            0.06565585699991061
+        ],
+        [
+            1.9282573609998508,
+            1.9262364920000437,
+            1.9274114049999298
+        ],
+        [
+            2.1353811700000733,
+            2.135909988999856,
+            2.139513435000026
+        ],
+        [
+            4.494763337999984,
+            4.506743129999904,
+            4.513714225000058
+        ],
+        [
+            1.9703748730000825,
+            1.9624855600000046,
+            1.9467992569998387
+        ],
+        [
+            0.5281160150000233,
+            0.5079870420001953,
+            0.5058492079999724
+        ],
+        [
+            0.9466686680000294,
+            0.9467454900000121,
+            0.9417365909998807
+        ],
+        [
+            1.6252639049998834,
+            1.6136391739998999,
+            1.6105059249998703
+        ],
+        [
+            3.961973265999859,
+            3.929348490999928,
+            4.038061261999928
+        ],
+        [
+            57.451627619999954,
+            57.806593514999804,
+            58.076217272999884
+        ],
+        [
+            1.6579584300000079,
+            1.5953056620001007,
+            1.604066080000166
+        ],
+        [
+            0.787072875999911,
+            0.805590096999822,
+            0.7736930999999458
+        ],
+        [
+            1.040926324000111,
+            1.09222987499993,
+            0.9836310939999748
+        ],
+        [
+            6.117114711999875,
+            5.623454530000117,
+            5.640725126999996
+        ],
+        [
+            3.3983772260000933,
+            3.133559688999867,
+            3.2032901739999033
+        ],
+        [
+            3.296611066999958,
+            3.1726310689998627,
+            3.1984622950001267
+        ],
+        [
+            0.5767455929999414,
+            0.4501746359999288,
+            0.5369838189999427
+        ],
+        [
+            0.3523692209998899,
+            0.30068239000001995,
+            0.29913635500020064
+        ],
+        [
+            0.7507235030000174,
+            0.3421970209999472,
+            0.3525650139999925
+        ],
+        [
+            0.2449839159999101,
+            0.23656309300008616,
+            0.23265344200012805
+        ],
+        [
+            0.25074233399982404,
+            0.26460407999979907,
+            0.25035662799996317
+        ],
+        [
+            1.4510024630001226,
+            1.184202398000025,
+            1.171343846000127
+        ],
+        [
+            0.5013381270000536,
+            0.28769277099991086,
+            0.28513832199996614
+        ],
+        [
+            0.21341560199994092,
+            0.2118193969999993,
+            0.2055801789999805
+        ]
+    ]
+}
\ No newline at end of file
diff --git a/polars/results/c6a.metal.json b/polars/results/c6a.metal.json
deleted file mode 100644
index 97e711221..000000000
--- a/polars/results/c6a.metal.json
+++ /dev/null
@@ -1,234 +0,0 @@
-{
-    "system": "Polars (DataFrame)",
-    "date": "2024-09-09",
-    "machine": "c6a.metal, 500gb gp2",
-    "cluster_size": 1,
-    "comment": "",
-    "tags": [
-        "C++",
-        "column-oriented",
-        "embedded",
-        "stateless",
-        "serverless",
-        "dataframe"
-    ],
-    "load_time": 274.5956620259999,
-    "data_size": 46998823722,
-    "result": [
-        [
-            2.7551000130188186e-05,
-            2.0500001483014785e-06,
-            3.8999996831989847e-07
-        ],
-        [
-            0.3393856870000036,
-            0.08128961900001741,
-            0.0792162879999978
-        ],
-        [
-            0.46830895299990516,
-            0.4213311490000251,
-            0.41986769899995124
-        ],
-        [
-            0.27111638099995616,
-            0.0691876239998237,
-            0.06879308600014156
-        ],
-        [
-            0.9961474940000699,
-            1.1656686840001385,
-            1.341034622000052
-        ],
-        [
-            3.293661254999961,
-            3.3244774100000996,
-            3.31177472100012
-        ],
-        [
-            0.05548804800014295,
-            0.05496264599992173,
-            0.05497070699993856
-        ],
-        [
-            0.30676389199993537,
-            0.11210982800002967,
-            0.10042662099999689
-        ],
-        [
-            5.758702494999852,
-            1.412199709000106,
-            0.9961499670000649
-        ],
-        [
-            1.269696352999972,
-            1.0784102180000446,
-            1.089092165000011
-        ],
-        [
-            1.5691383170001245,
-            0.7789256089999981,
-            0.7921542070000669
-        ],
-        [
-            1.0894663530000344,
-            0.9373611309999887,
-            0.8721730039999329
-        ],
-        [
-            2.494931741999835,
-            1.7917862040001182,
-            1.7991658439998446
-        ],
-        [
-            95.30520302599984,
-            96.30335870499971,
-            95.42217894499981
-        ],
-        [
-            1.9459286860001157,
-            1.9641287069998725,
-            1.9325827739999113
-        ],
-        [
-            1.4602280210001481,
-            1.2033372910000253,
-            1.3688985580001827
-        ],
-        [
-            3.3947668380001232,
-            2.9921093740003926,
-            2.996665767000195
-        ],
-        [
-            2.16839200000004,
-            2.5575646139996024,
-            2.477013052000075
-        ],
-        [
-            5.922572512999977,
-            6.192915480000011,
-            7.058452599999782
-        ],
-        [
-            0.21617201800017938,
-            0.051977289999740606,
-            0.05087722700000086
-        ],
-        [
-            6.605031917999895,
-            5.489592963999712,
-            3.316997033000007
-        ],
-        [
-            2.9719723640000666,
-            1.9411770279998564,
-            1.9698260360000859
-        ],
-        [
-            5.622727766000025,
-            4.052559812999789,
-            4.165921265999714
-        ],
-        [
-            2.1939042029998745,
-            2.0240198039996358,
-            2.0558673149998867
-        ],
-        [
-            3.2231196239999917,
-            2.574916228999882,
-            2.6234278580000137
-        ],
-        [
-            3.1981313249998493,
-            3.2646530639999582,
-            3.29449439300015
-        ],
-        [
-            2.607552430999931,
-            2.5564862259998336,
-            2.6211176460001298
-        ],
-        [
-            8.782033818999935,
-            9.022877838000113,
-            8.618930766000176
-        ],
-        [
-            65.78594618099987,
-            65.80185137000035,
-            65.21734767399994
-        ],
-        [
-            0.04148712999995041,
-            0.00017722400025377283,
-            2.6159999833907932e-05
-        ],
-        [
-            5.197321627000292,
-            1.8464567339997302,
-            1.8262284829997952
-        ],
-        [
-            2.020474259999901,
-            2.1032231610001872,
-            2.150142378000055
-        ],
-        [
-            6.235942041999806,
-            8.480420082000364,
-            7.408725919000062
-        ],
-        [
-            3.7926399410002887,
-            3.562234329000148,
-            3.638766590999694
-        ],
-        [
-            3.3556073149998156,
-            3.7019931729996642,
-            3.4804760279998845
-        ],
-        [
-            0.5585857320002106,
-            0.6201126340001792,
-            0.6397678640000777
-        ],
-        [
-            20.811287902000004,
-            20.985524244000317,
-            20.879136653000387
-        ],
-        [
-            18.704310231999898,
-            19.610116691000258,
-            18.715476107000086
-        ],
-        [
-            7.068463735000023,
-            7.329527066000082,
-            7.057720248999885
-        ],
-        [
-            null,
-            null,
-            null
-        ],
-        [
-            2.366720836999775,
-            2.364191364999897,
-            2.4704000149999956
-        ],
-        [
-            0.06867151899996315,
-            0.06851112599997577,
-            0.09032966999984637
-        ],
-        [
-            null,
-            null,
-            null
-        ]
-    ]
-}
\ No newline at end of file
diff --git a/polars/results/parquet_c6a.metal.json b/polars/results/parquet_c6a.metal.json
new file mode 100644
index 000000000..b75ce3680
--- /dev/null
+++ b/polars/results/parquet_c6a.metal.json
@@ -0,0 +1,229 @@
+{
+    "system": "Polars (Parquet)",
+    "date": "2024-11-29",
+    "machine": "c6a.metal, 500gb gp2",
+    "cluster_size": 1,
+    "comment": "",
+    "tags": [
+        "column-oriented",
+        "parquet"
+    ],
+    "load_time": null,
+    "result": [
+        [
+            0.07331929400004356,
+            0.0313942400000542,
+            0.0295857439999736
+        ],
+        [
+            0.056858242999965114,
+            0.03603707900003883,
+            0.03635798200002682
+        ],
+        [
+            0.3601835890000302,
+            0.12495236099994145,
+            0.36494900799993957
+        ],
+        [
+            0.1014015370000152,
+            0.10379579100003866,
+            0.09615968900004646
+        ],
+        [
+            1.8475413349999599,
+            1.2111640870000429,
+            1.3070971719999989
+        ],
+        [
+            3.073540879999996,
+            2.6459892290000653,
+            2.6156991359999893
+        ],
+        [
+            0.06517877500004943,
+            0.053004126000018914,
+            0.05571083600000293
+        ],
+        [
+            0.026876204000018333,
+            0.015052019999984623,
+            0.013367295999955786
+        ],
+        [
+            1.4109252119999383,
+            1.5793965889999981,
+            1.3722895870000684
+        ],
+        [
+            1.635907786999951,
+            1.7955330880000702,
+            1.8128875789999483
+        ],
+        [
+            0.29154992499991295,
+            0.32512125999994623,
+            0.3547269459999143
+        ],
+        [
+            0.39792985199994746,
+            0.3251884109999992,
+            0.38865446600004816
+        ],
+        [
+            2.138931853000031,
+            2.105218915000023,
+            2.1956906379999737
+        ],
+        [
+            1.6526281679999784,
+            1.6124568860000181,
+            1.6333364800000254
+        ],
+        [
+            1.6447148869999637,
+            1.6346969300000183,
+            1.6644061450000436
+        ],
+        [
+            2.1882379500000297,
+            1.905917633999934,
+            1.9231305759999486
+        ],
+        [
+            5.352270128999976,
+            5.237135877000014,
+            5.308867133000035
+        ],
+        [
+            5.853392568999993,
+            4.927742889000001,
+            4.913877669000044
+        ],
+        [
+            7.844565224999997,
+            7.873020868000026,
+            8.710109652000028
+        ],
+        [
+            0.08927957200000947,
+            0.028090512999938255,
+            0.02795156100000895
+        ],
+        [
+            0.4280026709999447,
+            0.3811629729999595,
+            0.39364887600004295
+        ],
+        [
+            0.45465083199997025,
+            0.4768754289999606,
+            0.48719940199998746
+        ],
+        [
+            0.7863661489999458,
+            0.7864010189999817,
+            0.7998755679999476
+        ],
+        [
+            1.8717920940000567,
+            1.8261706829999866,
+            2.037369308000052
+        ],
+        [
+            0.8631901180000341,
+            0.7759028259999923,
+            0.7840361930000199
+        ],
+        [
+            1.2422713079999994,
+            1.1825194030000148,
+            1.3301276820000112
+        ],
+        [
+            1.9708939380000174,
+            2.0363808529999687,
+            1.949735128000043
+        ],
+        [
+            9.631319166000026,
+            9.65408143600007,
+            9.62223931699998
+        ],
+        [
+            12.509586186999968,
+            12.429948261999925,
+            13.083593043000064
+        ],
+        [
+            1.6713843260000658,
+            1.5654188460000569,
+            1.2058937670000205
+        ],
+        [
+            0.874348419999933,
+            0.8206400289999465,
+            0.8770680310000216
+        ],
+        [
+            1.471688980999943,
+            1.4348052110000253,
+            1.5063776990000406
+        ],
+        [
+            6.304678343999967,
+            6.261220233000017,
+            6.6019086710000465
+        ],
+        [
+            21.32528498800002,
+            21.617486471999996,
+            21.69860179400007
+        ],
+        [
+            21.58728936099999,
+            22.069379411,
+            22.213318467999898
+        ],
+        [
+            1.6370663789999753,
+            1.123244365000005,
+            1.0656513379999524
+        ],
+        [
+            0.22456605399997898,
+            0.2239348240000254,
+            0.22996692600008828
+        ],
+        [
+            0.15421211200009566,
+            0.1544310240001323,
+            0.15089476100001775
+        ],
+        [
+            0.07680750100007572,
+            0.0744040249999216,
+            0.07324702699997943
+        ],
+        [
+            0.2344550140001047,
+            0.25678796600004716,
+            0.24425511999993432
+        ],
+        [
+            1.0822417379999933,
+            0.8278360040001189,
+            0.8236462620000111
+        ],
+        [
+            0.01962476800008517,
+            0.017242483000018183,
+            0.018496377000019493
+        ],
+        [
+            0.015187977999858049,
+            0.016929013999970266,
+            0.016555773999925805
+        ]
+    ]
+}
\ No newline at end of file