From 1b5630db010e1089d2fa43cf288dd97935520033 Mon Sep 17 00:00:00 2001 From: ritchie Date: Mon, 25 Nov 2024 13:44:52 +0100 Subject: [PATCH 01/10] fix: Fix Polars queries --- polars/query.py | 128 +++++++++++++++++++++++------------------------- 1 file changed, 61 insertions(+), 67 deletions(-) diff --git a/polars/query.py b/polars/query.py index 96df85dd5..269463acf 100755 --- a/polars/query.py +++ b/polars/query.py @@ -3,7 +3,7 @@ import pandas as pd import polars as pl import timeit -import datetime +from datetime import datetime, date import json hits = pd.read_parquet("hits.parquet") @@ -22,48 +22,48 @@ hits[col] = hits[col].astype(str) start = timeit.default_timer() -pl_df = pl.DataFrame(hits) +pl_df = pl.DataFrame(hits).rechunk() stop = timeit.default_timer() load_time = stop - start # 0: No., 1: SQL, 2: Pandas, 3: Polars -queries = queries = [ - ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.count(), lambda x: x.height), +queries = [ + ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.count(), lambda x: x.select(pl.len()).collect().height), ( "Q1", "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", lambda x: x[x["AdvEngineID"] != 0].count(), - lambda x: x.filter(pl.col("AdvEngineID") != 0).height, + lambda x: x.select(pl.col("AdvEngineID").filter(pl.col("AdvEngineID") != 0).count()).collect().height, ), ( "Q2", "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", lambda x: (x["AdvEngineID"].sum(), x.shape[0], x["ResolutionWidth"].mean()), - lambda x: (x["AdvEngineID"].sum(), x.height, x["ResolutionWidth"].mean()), + lambda x: (x.select(pl.col("advengineid").sum()).collect().item(), x.select(pl.len()).collect().item(), x.select(pl.col("advengineid").mean()).collect().item()), ), ( "Q3", "SELECT AVG(UserID) FROM hits;", lambda x: x["UserID"].mean(), - lambda x: x["UserID"].mean(), + lambda x: x.select(pl.col("UserID").mean()).collect().item(), ), ( "Q4", "SELECT COUNT(DISTINCT UserID) FROM hits;", lambda x: x["UserID"].nunique(), - lambda x: x["UserID"].n_unique(), + lambda x: x.select(pl.col("UserID").n_unique()).collect().item(), ), ( "Q5", "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", lambda x: x["SearchPhrase"].nunique(), - lambda x: x["SearchPhrase"].n_unique(), + lambda x: x.select(pl.col("SearchPhrase").n_unique()).collect().item(), ), ( "Q6", "SELECT MIN(EventDate), MAX(EventDate) FROM hits;", lambda x: (x["EventDate"].min(), x["EventDate"].max()), - lambda x: (x["EventDate"].min(), x["EventDate"].max()), + lambda x: x.select(pl.col("EventDate").min().alias("e_min"), pl.col("EventDate").max("e_max")).collect().rows()[0] ), ( "Q7", @@ -75,7 +75,7 @@ lambda x: x.filter(pl.col("AdvEngineID") != 0) .group_by("AdvEngineID") .agg(pl.len().alias("count")) - .sort("count", descending=True), + .sort("count", descending=True).collect(), ), ( "Q8", @@ -84,7 +84,7 @@ lambda x: x.group_by("RegionID") .agg(pl.col("UserID").n_unique()) .sort("UserID", descending=True) - .head(10), + .head(10).collect(), ), ( "Q9", @@ -101,7 +101,7 @@ ] ) .sort("AdvEngineID_sum", descending=True) - .head(10), + .head(10).collect(), ), ( "Q10", @@ -114,7 +114,7 @@ .group_by("MobilePhoneModel") .agg(pl.col("UserID").n_unique()) .sort("UserID", descending=True) - .head(10), + .head(10).collect(), ), ( "Q11", @@ -127,7 +127,7 @@ .group_by(["MobilePhone", "MobilePhoneModel"]) .agg(pl.col("UserID").n_unique()) .sort("UserID", descending=True) - .head(10), + .head(10).collect(), ), ( "Q12", @@ -140,7 +140,7 @@ .group_by("SearchPhrase") .agg(pl.len().alias("count")) .sort("count", descending=True) - .head(10), + .head(10).collect(), ), ( "Q13", @@ -153,7 +153,7 @@ .group_by("SearchPhrase") .agg(pl.col("UserID").n_unique()) .sort("UserID", descending=True) - .head(10), + .head(10).collect(), ), ( "Q14", @@ -166,7 +166,7 @@ .group_by(["SearchEngineID", "SearchPhrase"]) .agg(pl.len().alias("count")) .sort("count", descending=True) - .head(10), + .head(10).collect(), ), ( "Q15", @@ -175,7 +175,7 @@ lambda x: x.group_by("UserID") .agg(pl.len().alias("count")) .sort("count", descending=True) - .head(10), + .head(10).collect(), ), ( "Q16", @@ -184,13 +184,13 @@ lambda x: x.group_by(["UserID", "SearchPhrase"]) .agg(pl.len().alias("count")) .sort("count", descending=True) - .head(10), + .head(10).collect(), ), ( "Q17", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", lambda x: x.groupby(["UserID", "SearchPhrase"]).size().head(10), - lambda x: x.group_by(["UserID", "SearchPhrase"]).agg(pl.len()).head(10), + lambda x: x.group_by(["UserID", "SearchPhrase"]).agg(pl.len()).head(10).collect(), ), ( "Q18", @@ -203,19 +203,19 @@ ) .agg(pl.len().alias("count")) .sort("count", descending=True) - .head(10), + .head(10).collect(), ), ( "Q19", "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", lambda x: x[x["UserID"] == 435090932899640449], - lambda x: x.filter(pl.col("UserID") == 435090932899640449), + lambda x: x.select("UserID").filter(pl.col("UserID") == 435090932899640449).collect(), ), ( "Q20", "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", lambda x: x[x["URL"].str.contains("google")].shape[0], - lambda x: x.filter(pl.col("URL").str.contains("google")).height, + lambda x: x.filter(pl.col("URL").str.contains("google")).select(pl.len()).collect().item(), ), ( "Q21", @@ -230,7 +230,7 @@ .group_by("SearchPhrase") .agg([pl.col("URL").min(), pl.len().alias("count")]) .sort("count", descending=True) - .head(10), + .head(10).collect(), ), ( "Q22", @@ -260,7 +260,7 @@ ] ) .sort("count", descending=True) - .head(10), + .head(10).collect(), ), ( "Q23", @@ -270,7 +270,7 @@ .head(10), lambda x: x.filter(pl.col("URL").str.contains("google")) .sort("EventTime") - .head(10), + .head(10).collect(), ), ( "Q24", @@ -281,7 +281,7 @@ lambda x: x.filter(pl.col("SearchPhrase") != "") .sort("EventTime") .select("SearchPhrase") - .head(10), + .head(10).collect(), ), ( "Q25", @@ -292,7 +292,7 @@ lambda x: x.filter(pl.col("SearchPhrase") != "") .sort("SearchPhrase") .select("SearchPhrase") - .head(10), + .head(10).collect(), ), ( "Q26", @@ -303,7 +303,7 @@ lambda x: x.filter(pl.col("SearchPhrase") != "") .sort(["EventTime", "SearchPhrase"]) .select("SearchPhrase") - .head(10), + .head(10).collect(), ), ( "Q27", @@ -318,15 +318,13 @@ .group_by("CounterID") # GROUP BY CounterID .agg( [ - pl.col("URL") - .map_elements(lambda y: len(y), return_dtype=pl.Int64) - .alias("l"), # AVG(STRLEN(URL)) + pl.col("URL").str.len_chars().mean().alias("l"), # AVG(STRLEN(URL)) pl.len().alias("c"), # COUNT(*) ] ) .filter(pl.col("c") > 100000) # HAVING COUNT(*) > 100000 .sort("l", descending=True) # ORDER BY l DESC - .limit(25), # LIMIT 25, + .limit(25).collect(), # LIMIT 25, ), ( "Q28", @@ -352,18 +350,14 @@ .group_by("k") .agg( [ - pl.col("Referer").map_elements( - lambda y: len(y), return_dtype=pl.Int64 - ) - # .mean() # skip mean for now - .alias("l"), # AVG(STRLEN(Referer)) + pl.col("Referer").str.len_chars().mean().alias("l"), # AVG(STRLEN(Referer)) pl.col("Referer").min().alias("min_referer"), # MIN(Referer) pl.len().alias("c"), # COUNT(*) ] ) .filter(pl.col("c") > 100000) # HAVING COUNT(*) > 100000 .sort("l", descending=True) # ORDER BY l DESC - .limit(25) # LIMIT 25 + .limit(25).collect() # LIMIT 25 ), ), ( @@ -459,7 +453,7 @@ + x["ResolutionWidth"].shift(87).sum() + x["ResolutionWidth"].shift(88).sum() + x["ResolutionWidth"].shift(89).sum(), - lambda x: sum(x["ResolutionWidth"][:90] + pl.Series(range(90))), + lambda x: x.select(pl.sum_horizontal([pl.col("ResolutionWidth").shift(i) for i in range(1, 90)])).collect(), ), ( "Q30", @@ -482,7 +476,7 @@ ] ) .sort("c", descending=True) - .head(10), + .head(10).collect(), ), ( "Q31", @@ -505,7 +499,7 @@ ] ) .sort("c", descending=True) - .head(10), + .head(10).collect(), ), ( "Q32", @@ -526,7 +520,7 @@ ] ) .sort("c", descending=True) - .head(10), + .head(10).collect(), ), ( "Q33", @@ -535,7 +529,7 @@ lambda x: x.group_by("URL") .agg(pl.len().alias("c")) .sort("c", descending=True) - .head(10), + .head(10).collect(), ), ( "Q34", @@ -544,7 +538,7 @@ lambda x: x.group_by("URL") .agg(pl.len().alias("c")) .sort("c", descending=True) - .head(10), + .head(10).collect(), ), ( "Q35", @@ -562,7 +556,7 @@ .group_by(["ClientIP"]) .agg(pl.len().alias("c")) .sort("c", descending=True) - .head(10), + .head(10).collect(), ), ( "Q36", @@ -580,8 +574,8 @@ .nlargest(10), lambda x: x.filter( (pl.col("CounterID") == 62) - & (pl.col("EventDate") >= pl.datetime(2013, 7, 1)) - & (pl.col("EventDate") <= pl.datetime(2013, 7, 31)) + & (pl.col("EventDate") >= datetime(2013, 7, 1)) + & (pl.col("EventDate") <= datetime(2013, 7, 31)) & (pl.col("DontCountHits") == 0) & (pl.col("IsRefresh") == 0) & (pl.col("URL") != "") @@ -589,7 +583,7 @@ .group_by("URL") .agg(pl.len().alias("PageViews")) .sort("PageViews", descending=True) - .head(10), + .head(10).collect(), ), ( "Q37", @@ -607,8 +601,8 @@ .nlargest(10), lambda x: x.filter( (pl.col("CounterID") == 62) - & (pl.col("EventDate") >= pl.datetime(2013, 7, 1)) - & (pl.col("EventDate") <= pl.datetime(2013, 7, 31)) + & (pl.col("EventDate") >= datetime(2013, 7, 1)) + & (pl.col("EventDate") <= datetime(2013, 7, 31)) & (pl.col("DontCountHits") == 0) & (pl.col("IsRefresh") == 0) & (pl.col("Title") != "") @@ -616,7 +610,7 @@ .group_by("Title") .agg(pl.len().alias("PageViews")) .sort("PageViews", descending=True) - .head(10), + .head(10).collect(), ), ( "Q38", @@ -636,8 +630,8 @@ .iloc[1000:1010], lambda x: x.filter( (pl.col("CounterID") == 62) - & (pl.col("EventDate") >= pl.datetime(2013, 7, 1)) - & (pl.col("EventDate") <= pl.datetime(2013, 7, 31)) + & (pl.col("EventDate") >= datetime(2013, 7, 1)) + & (pl.col("EventDate") <= datetime(2013, 7, 31)) & (pl.col("IsRefresh") == 0) & (pl.col("IsLink") != 0) & (pl.col("IsDownload") == 0) @@ -645,7 +639,7 @@ .group_by("URL") .agg(pl.len().alias("PageViews")) .sort("PageViews", descending=True) - .slice(1000, 10), + .slice(1000, 10).collect(), ), ( "Q39", @@ -668,8 +662,8 @@ # note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace # lambda x: x.filter( # (pl.col("CounterID") == 62) - # & (pl.col("EventDate") >= pl.datetime(2013, 7, 1)) - # & (pl.col("EventDate") <= pl.datetime(2013, 7, 31)) + # & (pl.col("EventDate") >= datetime(2013, 7, 1)) + # & (pl.col("EventDate") <= datetime(2013, 7, 31)) # & (pl.col("IsRefresh") == 0) # ) # .group_by( @@ -706,8 +700,8 @@ .iloc[100:110], lambda x: x.filter( (pl.col("CounterID") == 62) - & (pl.col("EventDate") >= pl.datetime(2013, 7, 1)) - & (pl.col("EventDate") <= pl.datetime(2013, 7, 31)) + & (pl.col("EventDate") >= datetime(2013, 7, 1)) + & (pl.col("EventDate") <= datetime(2013, 7, 31)) & (pl.col("IsRefresh") == 0) & (pl.col("TraficSourceID").is_in([-1, 6])) & (pl.col("RefererHash") == 3594120000172545465) @@ -715,7 +709,7 @@ .group_by(["URLHash", "EventDate"]) .agg(pl.len().alias("PageViews")) .sort("PageViews", descending=True) - .slice(100, 10), + .slice(100, 10).collect(), ), ( "Q41", @@ -735,8 +729,8 @@ .iloc[10000:10010], lambda x: x.filter( (pl.col("CounterID") == 62) - & (pl.col("EventDate") >= pl.datetime(2013, 7, 1)) - & (pl.col("EventDate") <= pl.datetime(2013, 7, 31)) + & (pl.col("EventDate") >= datetime(2013, 7, 1)) + & (pl.col("EventDate") <= datetime(2013, 7, 31)) & (pl.col("IsRefresh") == 0) & (pl.col("DontCountHits") == 0) & (pl.col("URLHash") == 2868770270353813622) @@ -744,7 +738,7 @@ .group_by(["WindowClientWidth", "WindowClientHeight"]) .agg(pl.len().alias("PageViews")) .sort("PageViews", descending=True) - .slice(10000, 10), + .slice(10000, 10).collect(), ), ( "Q42", @@ -766,8 +760,8 @@ # expected leading integer in the duration string, found m # lambda x: x.filter( # (pl.col("CounterID") == 62) - # & (pl.col("EventDate") >= pl.datetime(2013, 7, 14)) - # & (pl.col("EventDate") <= pl.datetime(2013, 7, 15)) + # & (pl.col("EventDate") >= datetime(2013, 7, 14)) + # & (pl.col("EventDate") <= datetime(2013, 7, 15)) # & (pl.col("IsRefresh") == 0) # & (pl.col("DontCountHits") == 0) # ) @@ -792,7 +786,7 @@ result_json = { "system": "Polars (DataFrame)", - "date": datetime.date.today().strftime("%Y-%m-%d"), + "date": date.today().strftime("%Y-%m-%d"), "machine": "c6a.metal, 500gb gp2", "cluster_size": 1, "comment": "", From 77c10574f58e14f5205abd8e5c7d32f161d6abfe Mon Sep 17 00:00:00 2001 From: ritchie Date: Mon, 25 Nov 2024 13:46:32 +0100 Subject: [PATCH 02/10] single query --- polars/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polars/query.py b/polars/query.py index 269463acf..75eb3d8c6 100755 --- a/polars/query.py +++ b/polars/query.py @@ -39,7 +39,7 @@ "Q2", "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", lambda x: (x["AdvEngineID"].sum(), x.shape[0], x["ResolutionWidth"].mean()), - lambda x: (x.select(pl.col("advengineid").sum()).collect().item(), x.select(pl.len()).collect().item(), x.select(pl.col("advengineid").mean()).collect().item()), + lambda x: x.select(pl.col("advengineid").sum(), pl.len(), pl.col("advengineid").mean()).collect().rows()[0], ), ( "Q3", From 72d910b945a41a15d471b671e70106812ebddf27 Mon Sep 17 00:00:00 2001 From: ritchie Date: Mon, 25 Nov 2024 13:51:05 +0100 Subject: [PATCH 03/10] pass lazy --- polars/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polars/query.py b/polars/query.py index 75eb3d8c6..9afbb5fa2 100755 --- a/polars/query.py +++ b/polars/query.py @@ -22,7 +22,7 @@ hits[col] = hits[col].astype(str) start = timeit.default_timer() -pl_df = pl.DataFrame(hits).rechunk() +pl_df = pl.DataFrame(hits).rechunk().lazy() stop = timeit.default_timer() load_time = stop - start From bd16ac32e11bdda674944610ec6e679254641676 Mon Sep 17 00:00:00 2001 From: ritchie Date: Tue, 26 Nov 2024 08:26:51 +0100 Subject: [PATCH 04/10] remove pandas and fix all queries --- polars/benchmark.sh | 2 +- polars/query.py | 407 ++++++-------------------------------------- 2 files changed, 50 insertions(+), 359 deletions(-) diff --git a/polars/benchmark.sh b/polars/benchmark.sh index 16980aa40..10ad7e61f 100755 --- a/polars/benchmark.sh +++ b/polars/benchmark.sh @@ -4,7 +4,7 @@ sudo apt-get update sudo apt-get install -y python3-pip -pip install --break-system-packages pandas polars +pip install --break-system-packages polars # Download the data wget --no-verbose --continue https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet diff --git a/polars/query.py b/polars/query.py index 9afbb5fa2..feb798f26 100755 --- a/polars/query.py +++ b/polars/query.py @@ -1,77 +1,62 @@ #!/usr/bin/env python3 -import pandas as pd import polars as pl import timeit from datetime import datetime, date import json -hits = pd.read_parquet("hits.parquet") - -dataframe_size = hits.memory_usage().sum() +start = timeit.default_timer() +df = pl.scan_parquet("hits.parquet").collect() +stop = timeit.default_timer() +load_time = stop - start -# print("Dataframe(numpy) size:", dataframe_size, "bytes") # fix some types -hits["EventTime"] = pd.to_datetime(hits["EventTime"], unit="s") -hits["EventDate"] = pd.to_datetime(hits["EventDate"], unit="D") - -# fix all object columns to string -for col in hits.columns: - if hits[col].dtype == "O": - hits[col] = hits[col].astype(str) +df = df.with_columns( + (pl.col("EventTime") * 1000).cast(pl.Datetime(time_unit="ms")), + pl.col("EventDate").cast(pl.Date), +) +assert df["EventTime"][0].year == 2013 +df = df.rechunk() -start = timeit.default_timer() -pl_df = pl.DataFrame(hits).rechunk().lazy() -stop = timeit.default_timer() -load_time = stop - start +lf = df.lazy() -# 0: No., 1: SQL, 2: Pandas, 3: Polars +# 0: No., 1: SQL, 3: Polars queries = [ - ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.count(), lambda x: x.select(pl.len()).collect().height), + ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.select(pl.len()).collect().height), ( "Q1", "SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;", - lambda x: x[x["AdvEngineID"] != 0].count(), lambda x: x.select(pl.col("AdvEngineID").filter(pl.col("AdvEngineID") != 0).count()).collect().height, ), ( "Q2", "SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;", - lambda x: (x["AdvEngineID"].sum(), x.shape[0], x["ResolutionWidth"].mean()), - lambda x: x.select(pl.col("advengineid").sum(), pl.len(), pl.col("advengineid").mean()).collect().rows()[0], + lambda x: x.select(a_sum=pl.col("AdvEngineID").sum(), count=pl.len(), a_mean=pl.col("AdvEngineID").mean()).collect().rows()[0], ), ( "Q3", "SELECT AVG(UserID) FROM hits;", - lambda x: x["UserID"].mean(), lambda x: x.select(pl.col("UserID").mean()).collect().item(), ), ( "Q4", "SELECT COUNT(DISTINCT UserID) FROM hits;", - lambda x: x["UserID"].nunique(), lambda x: x.select(pl.col("UserID").n_unique()).collect().item(), ), ( "Q5", "SELECT COUNT(DISTINCT SearchPhrase) FROM hits;", - lambda x: x["SearchPhrase"].nunique(), lambda x: x.select(pl.col("SearchPhrase").n_unique()).collect().item(), ), ( "Q6", "SELECT MIN(EventDate), MAX(EventDate) FROM hits;", - lambda x: (x["EventDate"].min(), x["EventDate"].max()), - lambda x: x.select(pl.col("EventDate").min().alias("e_min"), pl.col("EventDate").max("e_max")).collect().rows()[0] + lambda x: x.select(e_min=pl.col("EventDate").min(), e_max=pl.col("EventDate").max()).collect().rows()[0] ), ( "Q7", "SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;", - lambda x: x[x["AdvEngineID"] != 0] - .groupby("AdvEngineID") - .size() - .sort_values(ascending=False), lambda x: x.filter(pl.col("AdvEngineID") != 0) .group_by("AdvEngineID") .agg(pl.len().alias("count")) @@ -80,7 +65,6 @@ ( "Q8", "SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;", - lambda x: x.groupby("RegionID")["UserID"].nunique().nlargest(10), lambda x: x.group_by("RegionID") .agg(pl.col("UserID").n_unique()) .sort("UserID", descending=True) @@ -89,9 +73,6 @@ ( "Q9", "SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;", - lambda x: x.groupby("RegionID") - .agg({"AdvEngineID": "sum", "ResolutionWidth": "mean", "UserID": "nunique"}) - .nlargest(10, "AdvEngineID"), lambda x: x.group_by("RegionID") .agg( [ @@ -106,10 +87,6 @@ ( "Q10", "SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;", - lambda x: x[x["MobilePhoneModel"] != ""] - .groupby("MobilePhoneModel")["UserID"] - .nunique() - .nlargest(10), lambda x: x.filter(pl.col("MobilePhoneModel") != "") .group_by("MobilePhoneModel") .agg(pl.col("UserID").n_unique()) @@ -119,10 +96,6 @@ ( "Q11", "SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;", - lambda x: x[x["MobilePhoneModel"] != ""] - .groupby(["MobilePhone", "MobilePhoneModel"])["UserID"] - .nunique() - .nlargest(10), lambda x: x.filter(pl.col("MobilePhoneModel") != "") .group_by(["MobilePhone", "MobilePhoneModel"]) .agg(pl.col("UserID").n_unique()) @@ -132,10 +105,6 @@ ( "Q12", "SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", - lambda x: x[x["SearchPhrase"] != ""] - .groupby("SearchPhrase") - .size() - .nlargest(10), lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by("SearchPhrase") .agg(pl.len().alias("count")) @@ -145,10 +114,6 @@ ( "Q13", "SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;", - lambda x: x[x["SearchPhrase"] != ""] - .groupby("SearchPhrase")["UserID"] - .nunique() - .nlargest(10), lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by("SearchPhrase") .agg(pl.col("UserID").n_unique()) @@ -158,10 +123,6 @@ ( "Q14", "SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;", - lambda x: x[x["SearchPhrase"] != ""] - .groupby(["SearchEngineID", "SearchPhrase"]) - .size() - .nlargest(10), lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["SearchEngineID", "SearchPhrase"]) .agg(pl.len().alias("count")) @@ -171,7 +132,6 @@ ( "Q15", "SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;", - lambda x: x.groupby("UserID").size().nlargest(10), lambda x: x.group_by("UserID") .agg(pl.len().alias("count")) .sort("count", descending=True) @@ -180,7 +140,6 @@ ( "Q16", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", - lambda x: x.groupby(["UserID", "SearchPhrase"]).size().nlargest(10), lambda x: x.group_by(["UserID", "SearchPhrase"]) .agg(pl.len().alias("count")) .sort("count", descending=True) @@ -189,17 +148,13 @@ ( "Q17", "SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;", - lambda x: x.groupby(["UserID", "SearchPhrase"]).size().head(10), lambda x: x.group_by(["UserID", "SearchPhrase"]).agg(pl.len()).head(10).collect(), ), ( "Q18", "SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;", - lambda x: x.groupby([x["UserID"], x["EventTime"].dt.minute, "SearchPhrase"]) - .size() - .nlargest(10), lambda x: x.group_by( - [pl.col("UserID"), x["EventTime"].dt.minute(), "SearchPhrase"] + [pl.col("UserID"), pl.col("EventTime").dt.minute(), "SearchPhrase"] ) .agg(pl.len().alias("count")) .sort("count", descending=True) @@ -208,22 +163,16 @@ ( "Q19", "SELECT UserID FROM hits WHERE UserID = 435090932899640449;", - lambda x: x[x["UserID"] == 435090932899640449], lambda x: x.select("UserID").filter(pl.col("UserID") == 435090932899640449).collect(), ), ( "Q20", "SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';", - lambda x: x[x["URL"].str.contains("google")].shape[0], lambda x: x.filter(pl.col("URL").str.contains("google")).select(pl.len()).collect().item(), ), ( "Q21", "SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", - lambda x: x[(x["URL"].str.contains("google")) & (x["SearchPhrase"] != "")] - .groupby("SearchPhrase") - .agg({"URL": "min", "SearchPhrase": "size"}) - .nlargest(10, "SearchPhrase"), lambda x: x.filter( (pl.col("URL").str.contains("google")) & (pl.col("SearchPhrase") != "") ) @@ -235,16 +184,6 @@ ( "Q22", "SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;", - lambda x: x[ - (x["Title"].str.contains("Google")) - & (~x["URL"].str.contains(".google.")) - & (x["SearchPhrase"] != "") - ] - .groupby("SearchPhrase") - .agg( - {"URL": "min", "Title": "min", "SearchPhrase": "size", "UserID": "nunique"} - ) - .nlargest(10, "SearchPhrase"), lambda x: x.filter( (pl.col("Title").str.contains("Google")) & (~pl.col("URL").str.contains(".google.")) @@ -265,9 +204,6 @@ ( "Q23", "SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;", - lambda x: x[x["URL"].str.contains("google")] - .sort_values(by="EventTime") - .head(10), lambda x: x.filter(pl.col("URL").str.contains("google")) .sort("EventTime") .head(10).collect(), @@ -275,9 +211,6 @@ ( "Q24", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;", - lambda x: x[x["SearchPhrase"] != ""] - .sort_values(by="EventTime")[["SearchPhrase"]] - .head(10), lambda x: x.filter(pl.col("SearchPhrase") != "") .sort("EventTime") .select("SearchPhrase") @@ -286,9 +219,6 @@ ( "Q25", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;", - lambda x: x[x["SearchPhrase"] != ""] - .sort_values(by="SearchPhrase")[["SearchPhrase"]] - .head(10), lambda x: x.filter(pl.col("SearchPhrase") != "") .sort("SearchPhrase") .select("SearchPhrase") @@ -297,9 +227,6 @@ ( "Q26", "SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;", - lambda x: x[x["SearchPhrase"] != ""] - .sort_values(by=["EventTime", "SearchPhrase"])[["SearchPhrase"]] - .head(10), lambda x: x.filter(pl.col("SearchPhrase") != "") .sort(["EventTime", "SearchPhrase"]) .select("SearchPhrase") @@ -308,12 +235,6 @@ ( "Q27", "SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", - lambda x: x[x["URL"] != ""] - .groupby("CounterID") - .filter(lambda g: g["URL"].count() > 100000) - .agg({"URL": lambda url: url.str.len().mean(), "CounterID": "size"}) - .sort_values() - .head(25), lambda x: x.filter(pl.col("URL") != "") # WHERE URL <> '' .group_by("CounterID") # GROUP BY CounterID .agg( @@ -329,17 +250,6 @@ ( "Q28", "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", - lambda x: ( - x[x["Referer"] != ""] - .assign(k=x["Referer"].str.extract(r"^https?://(?:www\.)?([^/]+)/.*$")[0]) - .groupby("k") - .filter(lambda g: g["Referer"].count() > 100000) - .agg( - min_referer=("Referer", "min"), - average_length=("Referer", lambda r: r.str.len().mean()), - ) - .head(25) - ), lambda x: ( x.filter(pl.col("Referer") != "") .with_columns( @@ -363,109 +273,11 @@ ( "Q29", "SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;", - lambda x: x["ResolutionWidth"].sum() - + x["ResolutionWidth"].shift(1).sum() - + x["ResolutionWidth"].shift(2).sum() - + x["ResolutionWidth"].shift(3).sum() - + x["ResolutionWidth"].shift(4).sum() - + x["ResolutionWidth"].shift(5).sum() - + x["ResolutionWidth"].shift(6).sum() - + x["ResolutionWidth"].shift(7).sum() - + x["ResolutionWidth"].shift(8).sum() - + x["ResolutionWidth"].shift(9).sum() - + x["ResolutionWidth"].shift(10).sum() - + x["ResolutionWidth"].shift(11).sum() - + x["ResolutionWidth"].shift(12).sum() - + x["ResolutionWidth"].shift(13).sum() - + x["ResolutionWidth"].shift(14).sum() - + x["ResolutionWidth"].shift(15).sum() - + x["ResolutionWidth"].shift(16).sum() - + x["ResolutionWidth"].shift(17).sum() - + x["ResolutionWidth"].shift(18).sum() - + x["ResolutionWidth"].shift(19).sum() - + x["ResolutionWidth"].shift(20).sum() - + x["ResolutionWidth"].shift(21).sum() - + x["ResolutionWidth"].shift(22).sum() - + x["ResolutionWidth"].shift(23).sum() - + x["ResolutionWidth"].shift(24).sum() - + x["ResolutionWidth"].shift(25).sum() - + x["ResolutionWidth"].shift(26).sum() - + x["ResolutionWidth"].shift(27).sum() - + x["ResolutionWidth"].shift(28).sum() - + x["ResolutionWidth"].shift(29).sum() - + x["ResolutionWidth"].shift(30).sum() - + x["ResolutionWidth"].shift(31).sum() - + x["ResolutionWidth"].shift(32).sum() - + x["ResolutionWidth"].shift(33).sum() - + x["ResolutionWidth"].shift(34).sum() - + x["ResolutionWidth"].shift(35).sum() - + x["ResolutionWidth"].shift(36).sum() - + x["ResolutionWidth"].shift(37).sum() - + x["ResolutionWidth"].shift(38).sum() - + x["ResolutionWidth"].shift(39).sum() - + x["ResolutionWidth"].shift(40).sum() - + x["ResolutionWidth"].shift(41).sum() - + x["ResolutionWidth"].shift(42).sum() - + x["ResolutionWidth"].shift(43).sum() - + x["ResolutionWidth"].shift(44).sum() - + x["ResolutionWidth"].shift(45).sum() - + x["ResolutionWidth"].shift(46).sum() - + x["ResolutionWidth"].shift(47).sum() - + x["ResolutionWidth"].shift(48).sum() - + x["ResolutionWidth"].shift(49).sum() - + x["ResolutionWidth"].shift(50).sum() - + x["ResolutionWidth"].shift(51).sum() - + x["ResolutionWidth"].shift(52).sum() - + x["ResolutionWidth"].shift(53).sum() - + x["ResolutionWidth"].shift(54).sum() - + x["ResolutionWidth"].shift(55).sum() - + x["ResolutionWidth"].shift(56).sum() - + x["ResolutionWidth"].shift(57).sum() - + x["ResolutionWidth"].shift(58).sum() - + x["ResolutionWidth"].shift(59).sum() - + x["ResolutionWidth"].shift(60).sum() - + x["ResolutionWidth"].shift(61).sum() - + x["ResolutionWidth"].shift(62).sum() - + x["ResolutionWidth"].shift(63).sum() - + x["ResolutionWidth"].shift(64).sum() - + x["ResolutionWidth"].shift(65).sum() - + x["ResolutionWidth"].shift(66).sum() - + x["ResolutionWidth"].shift(67).sum() - + x["ResolutionWidth"].shift(68).sum() - + x["ResolutionWidth"].shift(69).sum() - + x["ResolutionWidth"].shift(70).sum() - + x["ResolutionWidth"].shift(71).sum() - + x["ResolutionWidth"].shift(72).sum() - + x["ResolutionWidth"].shift(73).sum() - + x["ResolutionWidth"].shift(74).sum() - + x["ResolutionWidth"].shift(75).sum() - + x["ResolutionWidth"].shift(76).sum() - + x["ResolutionWidth"].shift(77).sum() - + x["ResolutionWidth"].shift(78).sum() - + x["ResolutionWidth"].shift(79).sum() - + x["ResolutionWidth"].shift(80).sum() - + x["ResolutionWidth"].shift(81).sum() - + x["ResolutionWidth"].shift(82).sum() - + x["ResolutionWidth"].shift(83).sum() - + x["ResolutionWidth"].shift(84).sum() - + x["ResolutionWidth"].shift(85).sum() - + x["ResolutionWidth"].shift(86).sum() - + x["ResolutionWidth"].shift(87).sum() - + x["ResolutionWidth"].shift(88).sum() - + x["ResolutionWidth"].shift(89).sum(), lambda x: x.select(pl.sum_horizontal([pl.col("ResolutionWidth").shift(i) for i in range(1, 90)])).collect(), ), ( "Q30", "SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;", - lambda x: x[x["SearchPhrase"] != ""] - .groupby(["SearchEngineID", "ClientIP"]) - .agg( - c=("SearchEngineID", "size"), - IsRefreshSum=("IsRefresh", "sum"), - AvgResolutionWidth=("ResolutionWidth", "mean"), - ) - .nlargest(10, "c"), lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["SearchEngineID", "ClientIP"]) .agg( @@ -481,14 +293,6 @@ ( "Q31", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", - lambda x: x[x["SearchPhrase"] != ""] - .groupby(["WatchID", "ClientIP"]) - .agg( - c=("WatchID", "size"), - IsRefreshSum=("IsRefresh", "sum"), - AvgResolutionWidth=("ResolutionWidth", "mean"), - ) - .nlargest(10, "c"), lambda x: x.filter(pl.col("SearchPhrase") != "") .group_by(["WatchID", "ClientIP"]) .agg( @@ -504,13 +308,6 @@ ( "Q32", "SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;", - lambda x: x.groupby(["WatchID", "ClientIP"]) - .agg( - c=("WatchID", "size"), - IsRefreshSum=("IsRefresh", "sum"), - AvgResolutionWidth=("ResolutionWidth", "mean"), - ) - .nlargest(10, "c"), lambda x: x.group_by(["WatchID", "ClientIP"]) .agg( [ @@ -525,7 +322,6 @@ ( "Q33", "SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;", - lambda x: x.groupby("URL").size().nlargest(10).reset_index(name="c"), lambda x: x.group_by("URL") .agg(pl.len().alias("c")) .sort("c", descending=True) @@ -534,7 +330,6 @@ ( "Q34", "SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;", - lambda x: x.groupby(["URL"]).size().nlargest(10).reset_index(name="c"), lambda x: x.group_by("URL") .agg(pl.len().alias("c")) .sort("c", descending=True) @@ -543,15 +338,6 @@ ( "Q35", "SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;", - lambda x: x.assign( - **{f"ClientIP_minus_{i}": x["ClientIP"] - i for i in range(1, 4)} - ) - .groupby( - ["ClientIP", "ClientIP_minus_1", "ClientIP_minus_2", "ClientIP_minus_3"] - ) - .size() - .nlargest(10) - .reset_index(name="c"), lambda x: x.with_columns([pl.col("ClientIP")]) .group_by(["ClientIP"]) .agg(pl.len().alias("c")) @@ -561,17 +347,6 @@ ( "Q36", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", - lambda x: x[ - (x["CounterID"] == 62) - & (x["EventDate"] >= "2013-07-01") - & (x["EventDate"] <= "2013-07-31") - & (x["DontCountHits"] == 0) - & (x["IsRefresh"] == 0) - & (x["URL"] != "") - ] - .groupby("URL") - .size() - .nlargest(10), lambda x: x.filter( (pl.col("CounterID") == 62) & (pl.col("EventDate") >= datetime(2013, 7, 1)) @@ -588,17 +363,6 @@ ( "Q37", "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", - lambda x: x[ - (x["CounterID"] == 62) - & (x["EventDate"] >= "2013-07-01") - & (x["EventDate"] <= "2013-07-31") - & (x["DontCountHits"] == 0) - & (x["IsRefresh"] == 0) - & (x["Title"] != "") - ] - .groupby("Title") - .size() - .nlargest(10), lambda x: x.filter( (pl.col("CounterID") == 62) & (pl.col("EventDate") >= datetime(2013, 7, 1)) @@ -615,19 +379,6 @@ ( "Q38", "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", - lambda x: x[ - (x["CounterID"] == 62) - & (x["EventDate"] >= "2013-07-01") - & (x["EventDate"] <= "2013-07-31") - & (x["IsRefresh"] == 0) - & (x["IsLink"] != 0) - & (x["IsDownload"] == 0) - ] - .groupby("URL") - .size() - .nlargest(10) - .reset_index(name="PageViews") - .iloc[1000:1010], lambda x: x.filter( (pl.col("CounterID") == 62) & (pl.col("EventDate") >= datetime(2013, 7, 1)) @@ -644,60 +395,31 @@ ( "Q39", "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", - lambda x: x[ - (x["CounterID"] == 62) - & (x["EventDate"] >= "2013-07-01") - & (x["EventDate"] <= "2013-07-31") - & (x["IsRefresh"] == 0) - ] - .groupby(["TraficSourceID", "SearchEngineID", "AdvEngineID", "Referer", "URL"]) - .size() - .nlargest(10) - .reset_index(name="PageViews") - .iloc[1000:1010], - lambda x: None, - # Crash with: - # thread '' panicked at crates/polars-time/src/windows/duration.rs:215:21: - # expected leading integer in the duration string, found m - # note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace - # lambda x: x.filter( - # (pl.col("CounterID") == 62) - # & (pl.col("EventDate") >= datetime(2013, 7, 1)) - # & (pl.col("EventDate") <= datetime(2013, 7, 31)) - # & (pl.col("IsRefresh") == 0) - # ) - # .group_by( - # [ - # "TraficSourceID", - # "SearchEngineID", - # "AdvEngineID", - # # pl.when(pl.col("SearchEngineID").eq(0) & pl.col("AdvEngineID").eq(0)) - # # .then(pl.col("Referer")) - # # .otherwise("") - # # .alias("Src"), - # "URL", - # ] - # ) - # .agg(pl.len().alias("PageViews")) - # .sort("PageViews", descending=True) - # .slice(1000, 10), + lambda x: x.filter( + (pl.col("CounterID") == 62) + & (pl.col("EventDate") >= datetime(2013, 7, 1)) + & (pl.col("EventDate") <= datetime(2013, 7, 31)) + & (pl.col("IsRefresh") == 0) + ) + .group_by( + [ + "TraficSourceID", + "SearchEngineID", + "AdvEngineID", + pl.when(pl.col("SearchEngineID").eq(0) & pl.col("AdvEngineID").eq(0)) + .then(pl.col("Referer")) + .otherwise("") + .alias("Src"), + "URL", + ] + ) + .agg(pl.len().alias("PageViews")) + .sort("PageViews", descending=True) + .slice(1000, 10), ), ( "Q40", "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", - lambda x: x[ - (x["CounterID"] == 62) - & (x["EventDate"] >= "2013-07-01") - & (x["EventDate"] <= "2013-07-31") - & (x["IsRefresh"] == 0) - & (x["TraficSourceID"].isin([-1, 6])) - & (x["RefererHash"] == 3594120000172545465) - ] - .groupby(["URLHash", "EventDate"]) - .size() - .nlargest(10) - .reset_index(name="PageViews") - .iloc[100:110], lambda x: x.filter( (pl.col("CounterID") == 62) & (pl.col("EventDate") >= datetime(2013, 7, 1)) @@ -714,19 +436,6 @@ ( "Q41", "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", - lambda x: x[ - (x["CounterID"] == 62) - & (x["EventDate"] >= "2013-07-01") - & (x["EventDate"] <= "2013-07-31") - & (x["IsRefresh"] == 0) - & (x["DontCountHits"] == 0) - & (x["URLHash"] == 2868770270353813622) - ] - .groupby(["WindowClientWidth", "WindowClientHeight"]) - .size() - .nlargest(10) - .reset_index(name="PageViews") - .iloc[10000:10010], lambda x: x.filter( (pl.col("CounterID") == 62) & (pl.col("EventDate") >= datetime(2013, 7, 1)) @@ -743,31 +452,17 @@ ( "Q42", "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", - lambda x: x[ - (x["CounterID"] == 62) - & (x["EventDate"] >= "2013-07-14") - & (x["EventDate"] <= "2013-07-15") - & (x["IsRefresh"] == 0) - & (x["DontCountHits"] == 0) - ] - .groupby(pd.Grouper(key="EventTime", freq="T")) - .size() - .reset_index(name="PageViews") - .iloc[1000:1010], lambda x: None, - # Crash with: - # thread '' panicked at crates/polars-time/src/windows/duration.rs:215:21: - # expected leading integer in the duration string, found m - # lambda x: x.filter( - # (pl.col("CounterID") == 62) - # & (pl.col("EventDate") >= datetime(2013, 7, 14)) - # & (pl.col("EventDate") <= datetime(2013, 7, 15)) - # & (pl.col("IsRefresh") == 0) - # & (pl.col("DontCountHits") == 0) - # ) - # .group_by(pl.col("EventTime").dt.truncate("minute")) - # .agg(pl.len().alias("PageViews")) - # .slice(1000, 10), + lambda x: x.filter( + (pl.col("CounterID") == 62) + & (pl.col("EventDate") >= datetime(2013, 7, 14)) + & (pl.col("EventDate") <= datetime(2013, 7, 15)) + & (pl.col("IsRefresh") == 0) + & (pl.col("DontCountHits") == 0) + ) + .group_by(pl.col("EventTime").dt.truncate("minute")) + .agg(pl.len().alias("PageViews")) + .slice(1000, 10), ), ] @@ -776,7 +471,7 @@ times = [] for _ in range(3): start = timeit.default_timer() - result = q[3](pl_df) + result = q[2](lf) end = timeit.default_timer() if result is None: times.append(None) @@ -791,15 +486,11 @@ "cluster_size": 1, "comment": "", "tags": [ - "C++", "column-oriented", - "embedded", - "stateless", - "serverless", "dataframe", ], "load_time": float(load_time), - "data_size": int(dataframe_size), + "data_size": int(lf.collect().estimated_size()), "result": queries_times, } From 5b262bf9371a670fe4e8112655e147add79c93ac Mon Sep 17 00:00:00 2001 From: ritchie Date: Tue, 26 Nov 2024 08:32:08 +0100 Subject: [PATCH 05/10] always update to latest --- polars/benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polars/benchmark.sh b/polars/benchmark.sh index 10ad7e61f..1cda7badb 100755 --- a/polars/benchmark.sh +++ b/polars/benchmark.sh @@ -4,7 +4,7 @@ sudo apt-get update sudo apt-get install -y python3-pip -pip install --break-system-packages polars +pip install -U --break-system-packages polars # Download the data wget --no-verbose --continue https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet From d12bfb69524b48ab210122140d7242aae5941cb7 Mon Sep 17 00:00:00 2001 From: ritchie Date: Tue, 26 Nov 2024 08:41:17 +0100 Subject: [PATCH 06/10] fix all queries --- polars/query.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/polars/query.py b/polars/query.py index feb798f26..ad9ed30ba 100755 --- a/polars/query.py +++ b/polars/query.py @@ -2,7 +2,7 @@ import polars as pl import timeit -from datetime import datetime, date +from datetime import date import json start = timeit.default_timer() @@ -13,7 +13,7 @@ # fix some types df = df.with_columns( - (pl.col("EventTime") * 1000).cast(pl.Datetime(time_unit="ms")), + (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")), pl.col("EventDate").cast(pl.Date), ) assert df["EventTime"][0].year == 2013 @@ -349,8 +349,8 @@ "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;", lambda x: x.filter( (pl.col("CounterID") == 62) - & (pl.col("EventDate") >= datetime(2013, 7, 1)) - & (pl.col("EventDate") <= datetime(2013, 7, 31)) + & (pl.col("EventDate") >= date(2013, 7, 1)) + & (pl.col("EventDate") <= date(2013, 7, 31)) & (pl.col("DontCountHits") == 0) & (pl.col("IsRefresh") == 0) & (pl.col("URL") != "") @@ -365,8 +365,8 @@ "SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;", lambda x: x.filter( (pl.col("CounterID") == 62) - & (pl.col("EventDate") >= datetime(2013, 7, 1)) - & (pl.col("EventDate") <= datetime(2013, 7, 31)) + & (pl.col("EventDate") >= date(2013, 7, 1)) + & (pl.col("EventDate") <= date(2013, 7, 31)) & (pl.col("DontCountHits") == 0) & (pl.col("IsRefresh") == 0) & (pl.col("Title") != "") @@ -381,8 +381,8 @@ "SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) - & (pl.col("EventDate") >= datetime(2013, 7, 1)) - & (pl.col("EventDate") <= datetime(2013, 7, 31)) + & (pl.col("EventDate") >= date(2013, 7, 1)) + & (pl.col("EventDate") <= date(2013, 7, 31)) & (pl.col("IsRefresh") == 0) & (pl.col("IsLink") != 0) & (pl.col("IsDownload") == 0) @@ -397,8 +397,8 @@ "SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;", lambda x: x.filter( (pl.col("CounterID") == 62) - & (pl.col("EventDate") >= datetime(2013, 7, 1)) - & (pl.col("EventDate") <= datetime(2013, 7, 31)) + & (pl.col("EventDate") >= date(2013, 7, 1)) + & (pl.col("EventDate") <= date(2013, 7, 31)) & (pl.col("IsRefresh") == 0) ) .group_by( @@ -408,22 +408,22 @@ "AdvEngineID", pl.when(pl.col("SearchEngineID").eq(0) & pl.col("AdvEngineID").eq(0)) .then(pl.col("Referer")) - .otherwise("") + .otherwise(pl.lit("")) .alias("Src"), "URL", ] ) .agg(pl.len().alias("PageViews")) .sort("PageViews", descending=True) - .slice(1000, 10), + .slice(1000, 10).collect(), ), ( "Q40", "SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;", lambda x: x.filter( (pl.col("CounterID") == 62) - & (pl.col("EventDate") >= datetime(2013, 7, 1)) - & (pl.col("EventDate") <= datetime(2013, 7, 31)) + & (pl.col("EventDate") >= date(2013, 7, 1)) + & (pl.col("EventDate") <= date(2013, 7, 31)) & (pl.col("IsRefresh") == 0) & (pl.col("TraficSourceID").is_in([-1, 6])) & (pl.col("RefererHash") == 3594120000172545465) @@ -438,8 +438,8 @@ "SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;", lambda x: x.filter( (pl.col("CounterID") == 62) - & (pl.col("EventDate") >= datetime(2013, 7, 1)) - & (pl.col("EventDate") <= datetime(2013, 7, 31)) + & (pl.col("EventDate") >= date(2013, 7, 1)) + & (pl.col("EventDate") <= date(2013, 7, 31)) & (pl.col("IsRefresh") == 0) & (pl.col("DontCountHits") == 0) & (pl.col("URLHash") == 2868770270353813622) @@ -455,19 +455,20 @@ lambda x: None, lambda x: x.filter( (pl.col("CounterID") == 62) - & (pl.col("EventDate") >= datetime(2013, 7, 14)) - & (pl.col("EventDate") <= datetime(2013, 7, 15)) + & (pl.col("EventDate") >= date(2013, 7, 14)) + & (pl.col("EventDate") <= date(2013, 7, 15)) & (pl.col("IsRefresh") == 0) & (pl.col("DontCountHits") == 0) ) .group_by(pl.col("EventTime").dt.truncate("minute")) .agg(pl.len().alias("PageViews")) - .slice(1000, 10), + .slice(1000, 10).collect(), ), ] queries_times = [] for q in queries: + print(q[0]) times = [] for _ in range(3): start = timeit.default_timer() From f0d18911943a4da82c7fe0a2d73fa1292e98bfc8 Mon Sep 17 00:00:00 2001 From: ritchie Date: Tue, 26 Nov 2024 08:48:55 +0100 Subject: [PATCH 07/10] don't do unicode regex --- polars/query.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/polars/query.py b/polars/query.py index ad9ed30ba..b92ed215a 100755 --- a/polars/query.py +++ b/polars/query.py @@ -6,7 +6,8 @@ import json start = timeit.default_timer() -df = pl.scan_parquet("hits.parquet").collect() +# df = pl.scan_parquet("hits.parquet").collect() +df = pl.read_parquet("hits.parquet", n_rows=int(1e7)) stop = timeit.default_timer() load_time = stop - start @@ -249,7 +250,7 @@ ), ( "Q28", - "SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", + "SELECT REGEXP_REPLACE(Referer, '(?-u)^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;", lambda x: ( x.filter(pl.col("Referer") != "") .with_columns( From c4613429ec1c6ee43f1643ce8404da731e0b9bf5 Mon Sep 17 00:00:00 2001 From: ritchie Date: Tue, 26 Nov 2024 09:02:59 +0100 Subject: [PATCH 08/10] undo slice --- polars/query.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/polars/query.py b/polars/query.py index b92ed215a..82a97aa01 100755 --- a/polars/query.py +++ b/polars/query.py @@ -6,8 +6,7 @@ import json start = timeit.default_timer() -# df = pl.scan_parquet("hits.parquet").collect() -df = pl.read_parquet("hits.parquet", n_rows=int(1e7)) +df = pl.scan_parquet("hits.parquet").collect() stop = timeit.default_timer() load_time = stop - start From 5b510928e89d1944806e973c4874b28c84b28cad Mon Sep 17 00:00:00 2001 From: ritchie Date: Thu, 28 Nov 2024 15:26:46 +0100 Subject: [PATCH 09/10] add parquet source --- polars/query.py | 120 ++++++++++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 55 deletions(-) diff --git a/polars/query.py b/polars/query.py index 82a97aa01..347c6e662 100755 --- a/polars/query.py +++ b/polars/query.py @@ -5,23 +5,7 @@ from datetime import date import json -start = timeit.default_timer() -df = pl.scan_parquet("hits.parquet").collect() -stop = timeit.default_timer() -load_time = stop - start - - -# fix some types -df = df.with_columns( - (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")), - pl.col("EventDate").cast(pl.Date), -) -assert df["EventTime"][0].year == 2013 -df = df.rechunk() - -lf = df.lazy() - -# 0: No., 1: SQL, 3: Polars +# 0: No., 1: SQL, 2: Polars queries = [ ("Q0", "SELECT COUNT(*) FROM hits;", lambda x: x.select(pl.len()).collect().height), ( @@ -452,7 +436,6 @@ ( "Q42", "SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;", - lambda x: None, lambda x: x.filter( (pl.col("CounterID") == 62) & (pl.col("EventDate") >= date(2013, 7, 14)) @@ -460,47 +443,74 @@ & (pl.col("IsRefresh") == 0) & (pl.col("DontCountHits") == 0) ) - .group_by(pl.col("EventTime").dt.truncate("minute")) + .group_by(pl.col("EventTime").dt.truncate("1m")) .agg(pl.len().alias("PageViews")) .slice(1000, 10).collect(), ), ] -queries_times = [] -for q in queries: - print(q[0]) - times = [] - for _ in range(3): - start = timeit.default_timer() - result = q[2](lf) - end = timeit.default_timer() - if result is None: - times.append(None) - else: - times.append(end - start) - queries_times.append(times) -result_json = { - "system": "Polars (DataFrame)", - "date": date.today().strftime("%Y-%m-%d"), - "machine": "c6a.metal, 500gb gp2", - "cluster_size": 1, - "comment": "", - "tags": [ - "column-oriented", - "dataframe", - ], - "load_time": float(load_time), - "data_size": int(lf.collect().estimated_size()), - "result": queries_times, -} +def run_timings(lf: pl.LazyFrame, name: str, src: str, load_time: int | None) -> None: + queries_times = [] + for q in queries: + print(q[0]) + times = [] + for _ in range(3): + start = timeit.default_timer() + result = q[2](lf) + end = timeit.default_timer() + if result is None: + times.append(None) + else: + times.append(end - start) + queries_times.append(times) + + result_json = { + "system": name, + "date": date.today().strftime("%Y-%m-%d"), + "machine": "c6a.metal, 500gb gp2", + "cluster_size": 1, + "comment": "", + "tags": [ + "column-oriented", + src, + ], + "load_time": float(load_time) if load_time is not None else None, + "result": queries_times, + } + # if cpuinfo contains "AMD EPYC 9654" update machine and write result into results/epyc-9654.json + if "AMD EPYC 9654" in open("/proc/cpuinfo").read(): + result_json["machine"] = "EPYC 9654, 384G" + with open(f"results/{src}_epyc-9654.json", "w") as f: + f.write(json.dumps(result_json, indent=4)) + else: + # write result into results/c6a.metal.json + with open(f"results/{src}_c6a.metal.json", "w") as f: + f.write(json.dumps(result_json, indent=4)) -# if cpuinfo contains "AMD EPYC 9654" update machine and write result into results/epyc-9654.json -if "AMD EPYC 9654" in open("/proc/cpuinfo").read(): - result_json["machine"] = "EPYC 9654, 384G" - with open("results/epyc-9654.json", "w") as f: - f.write(json.dumps(result_json, indent=4)) -else: - # write result into results/c6a.metal.json - with open("results/c6a.metal.json", "w") as f: - f.write(json.dumps(result_json, indent=4)) + +# Run from Parquet +lf = pl.scan_parquet("hits.parquet").with_columns( + (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")), + pl.col("EventDate").cast(pl.Date), +) +print("run parquet queries") +run_timings(lf, "Polars (Parquet)", "parquet", None) + + +print("run DataFrame (in-memory) queries, this loads all data in memory!") +start = timeit.default_timer() +df = pl.scan_parquet("hits.parquet").collect() +stop = timeit.default_timer() +load_time = stop - start + +# fix some types +df = df.with_columns( + (pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")), + pl.col("EventDate").cast(pl.Date), +) +assert df["EventTime"][0].year == 2013 +df = df.rechunk() + +lf = df.lazy() +run_timings(lf, "Polars (DataFrame)", "DataFrame", load_time) From 1d744275f035f9b4d2e8b0d98f77e472f438bb71 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 29 Nov 2024 11:41:40 +0000 Subject: [PATCH 10/10] Add results for c6a.metal --- polars/results/DataFrame_c6a.metal.json | 229 +++++++++++++++++++++++ polars/results/c6a.metal.json | 234 ------------------------ polars/results/parquet_c6a.metal.json | 229 +++++++++++++++++++++++ 3 files changed, 458 insertions(+), 234 deletions(-) create mode 100644 polars/results/DataFrame_c6a.metal.json delete mode 100644 polars/results/c6a.metal.json create mode 100644 polars/results/parquet_c6a.metal.json diff --git a/polars/results/DataFrame_c6a.metal.json b/polars/results/DataFrame_c6a.metal.json new file mode 100644 index 000000000..f214f8d26 --- /dev/null +++ b/polars/results/DataFrame_c6a.metal.json @@ -0,0 +1,229 @@ +{ + "system": "Polars (DataFrame)", + "date": "2024-11-29", + "machine": "c6a.metal, 500gb gp2", + "cluster_size": 1, + "comment": "", + "tags": [ + "column-oriented", + "DataFrame" + ], + "load_time": 2.3000679460001265, + "result": [ + [ + 0.00015112200003386533, + 3.7150000025576446e-05, + 2.554100001361803e-05 + ], + [ + 0.03171516099996552, + 0.032293901000002734, + 0.03215773800002353 + ], + [ + 0.4772652639999251, + 0.46750544699989405, + 0.4495371789998899 + ], + [ + 0.0710227990000476, + 0.07084154700010004, + 0.0707792650000556 + ], + [ + 2.057352185000127, + 1.7546763299999384, + 1.163346606999994 + ], + [ + 3.3611794670000563, + 3.361934698999903, + 3.356456425000033 + ], + [ + 0.022322305999978198, + 0.017734573999860004, + 0.017439570000078675 + ], + [ + 0.034810428999890064, + 0.03980527599992456, + 0.037032382999996116 + ], + [ + 1.1927395830000478, + 0.9333954369999446, + 0.986558266999964 + ], + [ + 1.1237861500001145, + 1.0634325710000212, + 1.2359178669998983 + ], + [ + 0.4698280170000544, + 0.4857669729999543, + 0.46914764600001035 + ], + [ + 0.4963598159999947, + 0.5227434319999702, + 0.42942682499983675 + ], + [ + 0.6119570860000749, + 0.6307538849998764, + 0.6839730260001033 + ], + [ + 0.6959008190001441, + 0.7254145639999479, + 0.766673179999998 + ], + [ + 0.750602372000003, + 0.7139416769998661, + 0.6843765409998923 + ], + [ + 0.8596073270000488, + 0.9934823169999163, + 0.9173948990001008 + ], + [ + 2.7692482420000033, + 2.3495000699999764, + 2.2436599499999375 + ], + [ + 2.026880264000056, + 1.9250255050001215, + 1.9470105970001441 + ], + [ + 4.853015905999882, + 4.480639889000031, + 4.586371847999999 + ], + [ + 0.065807199000119, + 0.06525409100004254, + 0.06565585699991061 + ], + [ + 1.9282573609998508, + 1.9262364920000437, + 1.9274114049999298 + ], + [ + 2.1353811700000733, + 2.135909988999856, + 2.139513435000026 + ], + [ + 4.494763337999984, + 4.506743129999904, + 4.513714225000058 + ], + [ + 1.9703748730000825, + 1.9624855600000046, + 1.9467992569998387 + ], + [ + 0.5281160150000233, + 0.5079870420001953, + 0.5058492079999724 + ], + [ + 0.9466686680000294, + 0.9467454900000121, + 0.9417365909998807 + ], + [ + 1.6252639049998834, + 1.6136391739998999, + 1.6105059249998703 + ], + [ + 3.961973265999859, + 3.929348490999928, + 4.038061261999928 + ], + [ + 57.451627619999954, + 57.806593514999804, + 58.076217272999884 + ], + [ + 1.6579584300000079, + 1.5953056620001007, + 1.604066080000166 + ], + [ + 0.787072875999911, + 0.805590096999822, + 0.7736930999999458 + ], + [ + 1.040926324000111, + 1.09222987499993, + 0.9836310939999748 + ], + [ + 6.117114711999875, + 5.623454530000117, + 5.640725126999996 + ], + [ + 3.3983772260000933, + 3.133559688999867, + 3.2032901739999033 + ], + [ + 3.296611066999958, + 3.1726310689998627, + 3.1984622950001267 + ], + [ + 0.5767455929999414, + 0.4501746359999288, + 0.5369838189999427 + ], + [ + 0.3523692209998899, + 0.30068239000001995, + 0.29913635500020064 + ], + [ + 0.7507235030000174, + 0.3421970209999472, + 0.3525650139999925 + ], + [ + 0.2449839159999101, + 0.23656309300008616, + 0.23265344200012805 + ], + [ + 0.25074233399982404, + 0.26460407999979907, + 0.25035662799996317 + ], + [ + 1.4510024630001226, + 1.184202398000025, + 1.171343846000127 + ], + [ + 0.5013381270000536, + 0.28769277099991086, + 0.28513832199996614 + ], + [ + 0.21341560199994092, + 0.2118193969999993, + 0.2055801789999805 + ] + ] +} \ No newline at end of file diff --git a/polars/results/c6a.metal.json b/polars/results/c6a.metal.json deleted file mode 100644 index 97e711221..000000000 --- a/polars/results/c6a.metal.json +++ /dev/null @@ -1,234 +0,0 @@ -{ - "system": "Polars (DataFrame)", - "date": "2024-09-09", - "machine": "c6a.metal, 500gb gp2", - "cluster_size": 1, - "comment": "", - "tags": [ - "C++", - "column-oriented", - "embedded", - "stateless", - "serverless", - "dataframe" - ], - "load_time": 274.5956620259999, - "data_size": 46998823722, - "result": [ - [ - 2.7551000130188186e-05, - 2.0500001483014785e-06, - 3.8999996831989847e-07 - ], - [ - 0.3393856870000036, - 0.08128961900001741, - 0.0792162879999978 - ], - [ - 0.46830895299990516, - 0.4213311490000251, - 0.41986769899995124 - ], - [ - 0.27111638099995616, - 0.0691876239998237, - 0.06879308600014156 - ], - [ - 0.9961474940000699, - 1.1656686840001385, - 1.341034622000052 - ], - [ - 3.293661254999961, - 3.3244774100000996, - 3.31177472100012 - ], - [ - 0.05548804800014295, - 0.05496264599992173, - 0.05497070699993856 - ], - [ - 0.30676389199993537, - 0.11210982800002967, - 0.10042662099999689 - ], - [ - 5.758702494999852, - 1.412199709000106, - 0.9961499670000649 - ], - [ - 1.269696352999972, - 1.0784102180000446, - 1.089092165000011 - ], - [ - 1.5691383170001245, - 0.7789256089999981, - 0.7921542070000669 - ], - [ - 1.0894663530000344, - 0.9373611309999887, - 0.8721730039999329 - ], - [ - 2.494931741999835, - 1.7917862040001182, - 1.7991658439998446 - ], - [ - 95.30520302599984, - 96.30335870499971, - 95.42217894499981 - ], - [ - 1.9459286860001157, - 1.9641287069998725, - 1.9325827739999113 - ], - [ - 1.4602280210001481, - 1.2033372910000253, - 1.3688985580001827 - ], - [ - 3.3947668380001232, - 2.9921093740003926, - 2.996665767000195 - ], - [ - 2.16839200000004, - 2.5575646139996024, - 2.477013052000075 - ], - [ - 5.922572512999977, - 6.192915480000011, - 7.058452599999782 - ], - [ - 0.21617201800017938, - 0.051977289999740606, - 0.05087722700000086 - ], - [ - 6.605031917999895, - 5.489592963999712, - 3.316997033000007 - ], - [ - 2.9719723640000666, - 1.9411770279998564, - 1.9698260360000859 - ], - [ - 5.622727766000025, - 4.052559812999789, - 4.165921265999714 - ], - [ - 2.1939042029998745, - 2.0240198039996358, - 2.0558673149998867 - ], - [ - 3.2231196239999917, - 2.574916228999882, - 2.6234278580000137 - ], - [ - 3.1981313249998493, - 3.2646530639999582, - 3.29449439300015 - ], - [ - 2.607552430999931, - 2.5564862259998336, - 2.6211176460001298 - ], - [ - 8.782033818999935, - 9.022877838000113, - 8.618930766000176 - ], - [ - 65.78594618099987, - 65.80185137000035, - 65.21734767399994 - ], - [ - 0.04148712999995041, - 0.00017722400025377283, - 2.6159999833907932e-05 - ], - [ - 5.197321627000292, - 1.8464567339997302, - 1.8262284829997952 - ], - [ - 2.020474259999901, - 2.1032231610001872, - 2.150142378000055 - ], - [ - 6.235942041999806, - 8.480420082000364, - 7.408725919000062 - ], - [ - 3.7926399410002887, - 3.562234329000148, - 3.638766590999694 - ], - [ - 3.3556073149998156, - 3.7019931729996642, - 3.4804760279998845 - ], - [ - 0.5585857320002106, - 0.6201126340001792, - 0.6397678640000777 - ], - [ - 20.811287902000004, - 20.985524244000317, - 20.879136653000387 - ], - [ - 18.704310231999898, - 19.610116691000258, - 18.715476107000086 - ], - [ - 7.068463735000023, - 7.329527066000082, - 7.057720248999885 - ], - [ - null, - null, - null - ], - [ - 2.366720836999775, - 2.364191364999897, - 2.4704000149999956 - ], - [ - 0.06867151899996315, - 0.06851112599997577, - 0.09032966999984637 - ], - [ - null, - null, - null - ] - ] -} \ No newline at end of file diff --git a/polars/results/parquet_c6a.metal.json b/polars/results/parquet_c6a.metal.json new file mode 100644 index 000000000..b75ce3680 --- /dev/null +++ b/polars/results/parquet_c6a.metal.json @@ -0,0 +1,229 @@ +{ + "system": "Polars (Parquet)", + "date": "2024-11-29", + "machine": "c6a.metal, 500gb gp2", + "cluster_size": 1, + "comment": "", + "tags": [ + "column-oriented", + "parquet" + ], + "load_time": null, + "result": [ + [ + 0.07331929400004356, + 0.0313942400000542, + 0.0295857439999736 + ], + [ + 0.056858242999965114, + 0.03603707900003883, + 0.03635798200002682 + ], + [ + 0.3601835890000302, + 0.12495236099994145, + 0.36494900799993957 + ], + [ + 0.1014015370000152, + 0.10379579100003866, + 0.09615968900004646 + ], + [ + 1.8475413349999599, + 1.2111640870000429, + 1.3070971719999989 + ], + [ + 3.073540879999996, + 2.6459892290000653, + 2.6156991359999893 + ], + [ + 0.06517877500004943, + 0.053004126000018914, + 0.05571083600000293 + ], + [ + 0.026876204000018333, + 0.015052019999984623, + 0.013367295999955786 + ], + [ + 1.4109252119999383, + 1.5793965889999981, + 1.3722895870000684 + ], + [ + 1.635907786999951, + 1.7955330880000702, + 1.8128875789999483 + ], + [ + 0.29154992499991295, + 0.32512125999994623, + 0.3547269459999143 + ], + [ + 0.39792985199994746, + 0.3251884109999992, + 0.38865446600004816 + ], + [ + 2.138931853000031, + 2.105218915000023, + 2.1956906379999737 + ], + [ + 1.6526281679999784, + 1.6124568860000181, + 1.6333364800000254 + ], + [ + 1.6447148869999637, + 1.6346969300000183, + 1.6644061450000436 + ], + [ + 2.1882379500000297, + 1.905917633999934, + 1.9231305759999486 + ], + [ + 5.352270128999976, + 5.237135877000014, + 5.308867133000035 + ], + [ + 5.853392568999993, + 4.927742889000001, + 4.913877669000044 + ], + [ + 7.844565224999997, + 7.873020868000026, + 8.710109652000028 + ], + [ + 0.08927957200000947, + 0.028090512999938255, + 0.02795156100000895 + ], + [ + 0.4280026709999447, + 0.3811629729999595, + 0.39364887600004295 + ], + [ + 0.45465083199997025, + 0.4768754289999606, + 0.48719940199998746 + ], + [ + 0.7863661489999458, + 0.7864010189999817, + 0.7998755679999476 + ], + [ + 1.8717920940000567, + 1.8261706829999866, + 2.037369308000052 + ], + [ + 0.8631901180000341, + 0.7759028259999923, + 0.7840361930000199 + ], + [ + 1.2422713079999994, + 1.1825194030000148, + 1.3301276820000112 + ], + [ + 1.9708939380000174, + 2.0363808529999687, + 1.949735128000043 + ], + [ + 9.631319166000026, + 9.65408143600007, + 9.62223931699998 + ], + [ + 12.509586186999968, + 12.429948261999925, + 13.083593043000064 + ], + [ + 1.6713843260000658, + 1.5654188460000569, + 1.2058937670000205 + ], + [ + 0.874348419999933, + 0.8206400289999465, + 0.8770680310000216 + ], + [ + 1.471688980999943, + 1.4348052110000253, + 1.5063776990000406 + ], + [ + 6.304678343999967, + 6.261220233000017, + 6.6019086710000465 + ], + [ + 21.32528498800002, + 21.617486471999996, + 21.69860179400007 + ], + [ + 21.58728936099999, + 22.069379411, + 22.213318467999898 + ], + [ + 1.6370663789999753, + 1.123244365000005, + 1.0656513379999524 + ], + [ + 0.22456605399997898, + 0.2239348240000254, + 0.22996692600008828 + ], + [ + 0.15421211200009566, + 0.1544310240001323, + 0.15089476100001775 + ], + [ + 0.07680750100007572, + 0.0744040249999216, + 0.07324702699997943 + ], + [ + 0.2344550140001047, + 0.25678796600004716, + 0.24425511999993432 + ], + [ + 1.0822417379999933, + 0.8278360040001189, + 0.8236462620000111 + ], + [ + 0.01962476800008517, + 0.017242483000018183, + 0.018496377000019493 + ], + [ + 0.015187977999858049, + 0.016929013999970266, + 0.016555773999925805 + ] + ] +} \ No newline at end of file