From 9a2e8f789915d05e3d4bd957b6b992f6e59dd94a Mon Sep 17 00:00:00 2001 From: Gabor Szarnyas Date: Tue, 26 Nov 2024 20:33:19 +0100 Subject: [PATCH] DuckDB: Add in-memory results --- duckdb-memory/benchmark.sh | 22 ++++++ duckdb-memory/create.sql | 108 +++++++++++++++++++++++++++ duckdb-memory/memory.py | 16 ++++ duckdb-memory/queries.sql | 43 +++++++++++ duckdb-memory/query.py | 36 +++++++++ duckdb-memory/results/c6a.metal.json | 56 ++++++++++++++ 6 files changed, 281 insertions(+) create mode 100755 duckdb-memory/benchmark.sh create mode 100644 duckdb-memory/create.sql create mode 100755 duckdb-memory/memory.py create mode 100644 duckdb-memory/queries.sql create mode 100755 duckdb-memory/query.py create mode 100644 duckdb-memory/results/c6a.metal.json diff --git a/duckdb-memory/benchmark.sh b/duckdb-memory/benchmark.sh new file mode 100755 index 000000000..3cd249fac --- /dev/null +++ b/duckdb-memory/benchmark.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Install + +sudo apt-get update +sudo apt-get install -y python3-pip +pip install --break-system-packages duckdb==1.1.3 psutil + +# Load the data + +wget --no-verbose --continue 'https://datasets.clickhouse.com/hits_compatible/hits.csv.gz' +gzip -d hits.csv.gz + +# Run the queries + +./query.py | tee log.txt 2>&1 + +cat log.txt | grep -P '^\d|Killed|Segmentation' | sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/' | + awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' + +/usr/bin/time -v ./memory.py + diff --git a/duckdb-memory/create.sql b/duckdb-memory/create.sql new file mode 100644 index 000000000..4d23eaac6 --- /dev/null +++ b/duckdb-memory/create.sql @@ -0,0 +1,108 @@ +CREATE TABLE hits +( + WatchID BIGINT NOT NULL, + JavaEnable SMALLINT NOT NULL, + Title TEXT, + GoodEvent SMALLINT NOT NULL, + EventTime TIMESTAMP NOT NULL, + EventDate Date NOT NULL, + CounterID INTEGER NOT NULL, + ClientIP INTEGER NOT NULL, + RegionID INTEGER NOT NULL, + UserID BIGINT NOT NULL, + CounterClass SMALLINT NOT NULL, + OS SMALLINT NOT NULL, + UserAgent SMALLINT NOT NULL, + URL TEXT, + Referer TEXT, + IsRefresh SMALLINT NOT NULL, + RefererCategoryID SMALLINT NOT NULL, + RefererRegionID INTEGER NOT NULL, + URLCategoryID SMALLINT NOT NULL, + URLRegionID INTEGER NOT NULL, + ResolutionWidth SMALLINT NOT NULL, + ResolutionHeight SMALLINT NOT NULL, + ResolutionDepth SMALLINT NOT NULL, + FlashMajor SMALLINT NOT NULL, + FlashMinor SMALLINT NOT NULL, + FlashMinor2 TEXT, + NetMajor SMALLINT NOT NULL, + NetMinor SMALLINT NOT NULL, + UserAgentMajor SMALLINT NOT NULL, + UserAgentMinor VARCHAR(255) NOT NULL, + CookieEnable SMALLINT NOT NULL, + JavascriptEnable SMALLINT NOT NULL, + IsMobile SMALLINT NOT NULL, + MobilePhone SMALLINT NOT NULL, + MobilePhoneModel TEXT, + Params TEXT, + IPNetworkID INTEGER NOT NULL, + TraficSourceID SMALLINT NOT NULL, + SearchEngineID SMALLINT NOT NULL, + SearchPhrase TEXT, + AdvEngineID SMALLINT NOT NULL, + IsArtifical SMALLINT NOT NULL, + WindowClientWidth SMALLINT NOT NULL, + WindowClientHeight SMALLINT NOT NULL, + ClientTimeZone SMALLINT NOT NULL, + ClientEventTime TIMESTAMP NOT NULL, + SilverlightVersion1 SMALLINT NOT NULL, + SilverlightVersion2 SMALLINT NOT NULL, + SilverlightVersion3 INTEGER NOT NULL, + SilverlightVersion4 SMALLINT NOT NULL, + PageCharset TEXT, + CodeVersion INTEGER NOT NULL, + IsLink SMALLINT NOT NULL, + IsDownload SMALLINT NOT NULL, + IsNotBounce SMALLINT NOT NULL, + FUniqID BIGINT NOT NULL, + OriginalURL TEXT, + HID INTEGER NOT NULL, + IsOldCounter SMALLINT NOT NULL, + IsEvent SMALLINT NOT NULL, + IsParameter SMALLINT NOT NULL, + DontCountHits SMALLINT NOT NULL, + WithHash SMALLINT NOT NULL, + HitColor CHAR NOT NULL, + LocalEventTime TIMESTAMP NOT NULL, + Age SMALLINT NOT NULL, + Sex SMALLINT NOT NULL, + Income SMALLINT NOT NULL, + Interests SMALLINT NOT NULL, + Robotness SMALLINT NOT NULL, + RemoteIP INTEGER NOT NULL, + WindowName INTEGER NOT NULL, + OpenerName INTEGER NOT NULL, + HistoryLength SMALLINT NOT NULL, + BrowserLanguage TEXT, + BrowserCountry TEXT, + SocialNetwork TEXT, + SocialAction TEXT, + HTTPError SMALLINT NOT NULL, + SendTiming INTEGER NOT NULL, + DNSTiming INTEGER NOT NULL, + ConnectTiming INTEGER NOT NULL, + ResponseStartTiming INTEGER NOT NULL, + ResponseEndTiming INTEGER NOT NULL, + FetchTiming INTEGER NOT NULL, + SocialSourceNetworkID SMALLINT NOT NULL, + SocialSourcePage TEXT, + ParamPrice BIGINT NOT NULL, + ParamOrderID TEXT, + ParamCurrency TEXT, + ParamCurrencyID SMALLINT NOT NULL, + OpenstatServiceName TEXT, + OpenstatCampaignID TEXT, + OpenstatAdID TEXT, + OpenstatSourceID TEXT, + UTMSource TEXT, + UTMMedium TEXT, + UTMCampaign TEXT, + UTMContent TEXT, + UTMTerm TEXT, + FromTag TEXT, + HasGCLID SMALLINT NOT NULL, + RefererHash BIGINT NOT NULL, + URLHash BIGINT NOT NULL, + CLID INTEGER NOT NULL +); diff --git a/duckdb-memory/memory.py b/duckdb-memory/memory.py new file mode 100755 index 000000000..30966808d --- /dev/null +++ b/duckdb-memory/memory.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 + +import duckdb + +# Load the data to determine the memory use. +# This load is not timed. +con = duckdb.connect(read_only=False) + +# enable the progress bar +con.execute('PRAGMA enable_progress_bar;') +con.execute('PRAGMA enable_print_progress_bar;') +# disable preservation of insertion order +con.execute("SET preserve_insertion_order = false;") + +con.execute(open("create.sql").read()) +con.execute("COPY hits FROM 'hits.csv';") diff --git a/duckdb-memory/queries.sql b/duckdb-memory/queries.sql new file mode 100644 index 000000000..b4115ee3a --- /dev/null +++ b/duckdb-memory/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/duckdb-memory/query.py b/duckdb-memory/query.py new file mode 100755 index 000000000..5e6d104af --- /dev/null +++ b/duckdb-memory/query.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +import duckdb +import timeit +import sys +import os + +con = duckdb.connect(read_only=False) + +# enable the progress bar +con.execute('PRAGMA enable_progress_bar;') +con.execute('PRAGMA enable_print_progress_bar;') +# disable preservation of insertion order +con.execute("SET preserve_insertion_order = false;") + +# perform the actual load +print("Will load the data") +start = timeit.default_timer() +con.execute(open("create.sql").read()) +con.execute("COPY hits FROM 'hits.csv';") +end = timeit.default_timer() +print(end - start) + +with open('queries.sql', 'r') as file: + for query in file: + os.system("sync") + os.system("echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null") + + print(query) + + for try_num in range(3): + start = timeit.default_timer() + results = con.sql(query).fetchall() + end = timeit.default_timer() + print(end - start) + del results diff --git a/duckdb-memory/results/c6a.metal.json b/duckdb-memory/results/c6a.metal.json new file mode 100644 index 000000000..08caa5b73 --- /dev/null +++ b/duckdb-memory/results/c6a.metal.json @@ -0,0 +1,56 @@ +{ + "system": "DuckDB (memory)", + "date": "2024-11-26", + "machine": "c6a.metal, 500gb gp2", + "cluster_size": 1, + + "tags": ["C++", "column-oriented", "embedded"], + + "load_time": 29.568019166999875, + "data_size": 102315057152, + "result": [ + [28.37229389499771,0.01444745900153066,0.010460285000590375], + [0.00818797199826804,0.018453592998412205,0.010097247002704535], + [0.009875684001599438,0.014756544002011651,0.013079602998914197], + [0.01480884600096033,0.018506574000639375,0.01780792100180406], + [0.02132975700078532,0.8275158579999697,0.24229677399853244], + [0.2527651769996737,0.19887691599797108,0.18186214999877848], + [0.1886556860008568,0.01614768999934313,0.011127936999400845], + [0.012829299001168692,0.017017687001498416,0.011138907000713516], + [0.012957981001818553,0.24885185500170337,0.2766447909998533], + [0.25002359700010857,0.2892966260005778,0.2779514459980419], + [0.30130979000023217,0.08444572000007611,0.06657531800010474], + [0.059446135001053335,0.07427780000216444,0.06103535399961402], + [0.058512896997854114,0.2084252730019216,0.2086321480019251], + [0.21721077699839952,0.46506098299869336,0.4530917009979021], + [0.4697990730019228,0.22253320099844132,0.21058130000164965], + [0.20870386100068572,0.3215249130007578,0.27229407300183084], + [0.2499453230011568,0.3683013289992232,0.31991668999762624], + [0.28497551399777876,0.3048137890000362,0.3115999949986872], + [0.318867735000822,0.8751203950014315,0.7716744879980979], + [0.7994624790007947,0.01659778099929099,0.01250473500113003], + [0.012601426999026444,0.3383350500007509,0.09837473600055091], + [0.09575407699958305,0.08805634300006204,0.10064015900206869], + [0.0844710359997407,0.4643428749986924,0.19186990199887077], + [0.19248208299904945,1.3182347119982296,0.5620290089973423], + [0.5513104279998515,0.06911592700271285,0.05588764800268109], + [0.062015663999773096,0.04412370800127974,0.049542190001375275], + [0.04080415600037668,0.10599739000099362,0.1094067330013786], + [0.10183726099785417,0.08797964200130082,0.08413135899900226], + [0.08379066300039995,2.008289692999824,2.001949085999513], + [1.925026742999762,0.0873897500023304,0.0680545590003021], + [0.0829120550006337,0.18141677500170772,0.1736573039997893], + [0.1532601569997496,0.18473046199869714,0.16197961100260727], + [0.16216968500157236,0.6304102140020404,0.6036183989999699], + [0.5863511949974054,0.5486020050011575,0.5516450720024295], + [0.5755033479981648,0.5668365320016164,0.5793202010027017], + [0.5245683149987599,0.24902728499728255,0.1521922360007011], + [0.2654282980001881,0.07863214099779725,0.050029524998535635], + [0.024858904998836806,0.06018337899877224,0.017208747998665785], + [0.023967166998772882,0.04589027600013651,0.06055860699780169], + [0.023993877999600954,0.08684039799845777,0.04880085199692985], + [0.04033727000205545,0.013630619996547466,0.035837274001096375], + [0.008728707001864677,0.011268315996858291,0.009220824998919852], + [0.018453843000315828,0.014310172999103088,0.007255508000525879] + ] +}