Skip to content

Commit

Permalink
fix all queries
Browse files (browse the repository at this point in the history)
  • Loading branch information
ritchie46 committed Nov 26, 2024
1 parent 5b262bf commit d12bfb6
Showing 1 changed file with 20 additions and 19 deletions.
39 changes: 20 additions & 19 deletions polars/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import polars as pl
import timeit
-from datetime import datetime, date
+from datetime import date
import json

start = timeit.default_timer()
Expand All @@ -13,7 +13,7 @@

# fix some types
df = df.with_columns(
-(pl.col("EventTime") * 1000).cast(pl.Datetime(time_unit="ms")),
+(pl.col("EventTime") * int(1e6)).cast(pl.Datetime(time_unit="us")),
pl.col("EventDate").cast(pl.Date),
)
assert df["EventTime"][0].year == 2013
Expand Down Expand Up @@ -349,8 +349,8 @@
"SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;",
lambda x: x.filter(
(pl.col("CounterID") == 62)
-& (pl.col("EventDate") >= datetime(2013, 7, 1))
-& (pl.col("EventDate") <= datetime(2013, 7, 31))
+& (pl.col("EventDate") >= date(2013, 7, 1))
+& (pl.col("EventDate") <= date(2013, 7, 31))
& (pl.col("DontCountHits") == 0)
& (pl.col("IsRefresh") == 0)
& (pl.col("URL") != "")
Expand All @@ -365,8 +365,8 @@
"SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;",
lambda x: x.filter(
(pl.col("CounterID") == 62)
-& (pl.col("EventDate") >= datetime(2013, 7, 1))
-& (pl.col("EventDate") <= datetime(2013, 7, 31))
+& (pl.col("EventDate") >= date(2013, 7, 1))
+& (pl.col("EventDate") <= date(2013, 7, 31))
& (pl.col("DontCountHits") == 0)
& (pl.col("IsRefresh") == 0)
& (pl.col("Title") != "")
Expand All @@ -381,8 +381,8 @@
"SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;",
lambda x: x.filter(
(pl.col("CounterID") == 62)
-& (pl.col("EventDate") >= datetime(2013, 7, 1))
-& (pl.col("EventDate") <= datetime(2013, 7, 31))
+& (pl.col("EventDate") >= date(2013, 7, 1))
+& (pl.col("EventDate") <= date(2013, 7, 31))
& (pl.col("IsRefresh") == 0)
& (pl.col("IsLink") != 0)
& (pl.col("IsDownload") == 0)
Expand All @@ -397,8 +397,8 @@
"SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;",
lambda x: x.filter(
(pl.col("CounterID") == 62)
-& (pl.col("EventDate") >= datetime(2013, 7, 1))
-& (pl.col("EventDate") <= datetime(2013, 7, 31))
+& (pl.col("EventDate") >= date(2013, 7, 1))
+& (pl.col("EventDate") <= date(2013, 7, 31))
& (pl.col("IsRefresh") == 0)
)
.group_by(
Expand All @@ -408,22 +408,22 @@
"AdvEngineID",
pl.when(pl.col("SearchEngineID").eq(0) & pl.col("AdvEngineID").eq(0))
.then(pl.col("Referer"))
-.otherwise("")
+.otherwise(pl.lit(""))
.alias("Src"),
"URL",
]
)
.agg(pl.len().alias("PageViews"))
.sort("PageViews", descending=True)
-.slice(1000, 10),
+.slice(1000, 10).collect(),
),
(
"Q40",
"SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;",
lambda x: x.filter(
(pl.col("CounterID") == 62)
-& (pl.col("EventDate") >= datetime(2013, 7, 1))
-& (pl.col("EventDate") <= datetime(2013, 7, 31))
+& (pl.col("EventDate") >= date(2013, 7, 1))
+& (pl.col("EventDate") <= date(2013, 7, 31))
& (pl.col("IsRefresh") == 0)
& (pl.col("TraficSourceID").is_in([-1, 6]))
& (pl.col("RefererHash") == 3594120000172545465)
Expand All @@ -438,8 +438,8 @@
"SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;",
lambda x: x.filter(
(pl.col("CounterID") == 62)
-& (pl.col("EventDate") >= datetime(2013, 7, 1))
-& (pl.col("EventDate") <= datetime(2013, 7, 31))
+& (pl.col("EventDate") >= date(2013, 7, 1))
+& (pl.col("EventDate") <= date(2013, 7, 31))
& (pl.col("IsRefresh") == 0)
& (pl.col("DontCountHits") == 0)
& (pl.col("URLHash") == 2868770270353813622)
Expand All @@ -455,19 +455,20 @@
lambda x: None,
lambda x: x.filter(
(pl.col("CounterID") == 62)
-& (pl.col("EventDate") >= datetime(2013, 7, 14))
-& (pl.col("EventDate") <= datetime(2013, 7, 15))
+& (pl.col("EventDate") >= date(2013, 7, 14))
+& (pl.col("EventDate") <= date(2013, 7, 15))
& (pl.col("IsRefresh") == 0)
& (pl.col("DontCountHits") == 0)
)
.group_by(pl.col("EventTime").dt.truncate("minute"))
.agg(pl.len().alias("PageViews"))
-.slice(1000, 10),
+.slice(1000, 10).collect(),
),
]

queries_times = []
for q in queries:
+print(q[0])
times = []
for _ in range(3):
start = timeit.default_timer()
Expand Down

0 comments on commit d12bfb6

Please sign in to comment.