From 4b2f257a48ec3e0eb82ece72c02142e483619f3f Mon Sep 17 00:00:00 2001 From: ruslandoga <67764432+ruslandoga@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:47:19 +0700 Subject: [PATCH] update readme --- CHANGELOG.md | 4 + README.md | 224 +++++++++++++-------------------------------------- lib/ch.ex | 10 +-- 3 files changed, 66 insertions(+), 172 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb8dfe8..2904c7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Unreleased + +- move rows for INSERT from `params` to `statement` + ## 0.2.8 (2024-09-06) - support named tuples https://github.com/plausible/ch/pull/197 diff --git a/README.md b/README.md index beae3e5..a48199c 100644 --- a/README.md +++ b/README.md @@ -52,9 +52,6 @@ defaults = [ {:ok, %Ch.Result{rows: [[0], [1], [2]]}} = Ch.query(pid, "SELECT * FROM system.numbers LIMIT 3") -{:ok, %Ch.Result{rows: [[0], [1], [2]]}} = - Ch.query(pid, "SELECT * FROM system.numbers LIMIT {$0:UInt8}", [3]) - {:ok, %Ch.Result{rows: [[0], [1], [2]]}} = Ch.query(pid, "SELECT * FROM system.numbers LIMIT {limit:UInt8}", %{"limit" => 3}) ``` @@ -75,9 +72,6 @@ Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null") %Ch.Result{num_rows: 2} = Ch.query!(pid, "INSERT INTO ch_demo(id) VALUES (0), (1)") -%Ch.Result{num_rows: 2} = - Ch.query!(pid, "INSERT INTO ch_demo(id) VALUES ({$0:UInt8}), ({$1:UInt32})", [0, 1]) - %Ch.Result{num_rows: 2} = Ch.query!(pid, "INSERT INTO ch_demo(id) VALUES ({a:UInt16}), ({b:UInt64})", %{"a" => 0, "b" => 1}) @@ -85,36 +79,52 @@ Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null") Ch.query!(pid, "INSERT INTO ch_demo(id) SELECT number FROM system.numbers LIMIT {limit:UInt8}", %{"limit" => 2}) ``` -#### Insert rows as [RowBinary](https://clickhouse.com/docs/en/interfaces/formats#rowbinary) (efficient) +#### Insert [RowBinary](https://clickhouse.com/docs/en/interfaces/formats#rowbinary) ```elixir {:ok, pid} = Ch.start_link() 
-Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null") +Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64, text String) ENGINE Null") -types = ["UInt64"] +rows = [ + [0, "a"], + [1, "b"] +] + +types = ["UInt64", "String"] # or -types = [Ch.Types.u64()] +types = [Ch.Types.u64(), Ch.Types.string()] # or -types = [:u64] +types = [:u64, :string] + +rowbinary = Ch.RowBinary.encode_rows(rows, types) %Ch.Result{num_rows: 2} = - Ch.query!(pid, "INSERT INTO ch_demo(id) FORMAT RowBinary", [[0], [1]], types: types) + Ch.query!(pid, ["INSERT INTO ch_demo(id, text) FORMAT RowBinary\n" | rowbinary]) ``` -Note that RowBinary format encoding requires `:types` option to be provided. - Similarly, you can use [`RowBinaryWithNamesAndTypes`](https://clickhouse.com/docs/en/interfaces/formats#rowbinarywithnamesandtypes) which would additionally do something like a type check. ```elixir -sql = "INSERT INTO ch_demo FORMAT RowBinaryWithNamesAndTypes" -opts = [names: ["id"], types: ["UInt64"]] -rows = [[0], [1]] +sql = "INSERT INTO ch_demo FORMAT RowBinaryWithNamesAndTypes\n" + +rows = [ + [0, "a"], + [1, "b"] +] -%Ch.Result{num_rows: 2} = Ch.query!(pid, sql, rows, opts) +types = ["UInt64", "String"] +names = ["id", "text"] + +data = [ + Ch.RowBinary.encode_names_and_types(names, types), + Ch.RowBinary.encode_rows(rows, types) +] + +%Ch.Result{num_rows: 2} = Ch.query!(pid, [sql | data]) ``` -#### Insert rows in custom [format](https://clickhouse.com/docs/en/interfaces/formats) +#### Insert rows in some other [format](https://clickhouse.com/docs/en/interfaces/formats) ```elixir {:ok, pid} = Ch.start_link() @@ -124,26 +134,27 @@ Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null") csv = [0, 1] |> Enum.map(&to_string/1) |> Enum.intersperse(?\n) %Ch.Result{num_rows: 2} = - Ch.query!(pid, "INSERT INTO ch_demo(id) FORMAT CSV", csv, encode: false) + Ch.query!(pid, ["INSERT INTO ch_demo(id) FORMAT CSV\n" | csv]) ``` -#### Insert rows as chunked 
RowBinary stream +#### Insert [chunked](https://en.wikipedia.org/wiki/Chunked_transfer_encoding) RowBinary stream ```elixir {:ok, pid} = Ch.start_link() Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null") -stream = Stream.repeatedly(fn -> [:rand.uniform(100)] end) -chunked = Stream.chunk_every(stream, 100) -encoded = Stream.map(chunked, fn chunk -> Ch.RowBinary.encode_rows(chunk, _types = ["UInt64"]) end) -ten_encoded_chunks = Stream.take(encoded, 10) - -%Ch.Result{num_rows: 1000} = - Ch.query(pid, "INSERT INTO ch_demo(id) FORMAT RowBinary", ten_encoded_chunks, encode: false) +DBConnection.run(pid, fn conn -> + Stream.repeatedly(fn -> [:rand.uniform(100)] end) + |> Stream.chunk_every(100_000) + |> Stream.map(fn chunk -> Ch.RowBinary.encode_rows(chunk, _types = ["UInt64"]) end) + |> Stream.take(10) + |> Stream.into(Ch.stream(conn, "INSERT INTO ch_demo(id) FORMAT RowBinary\n")) + |> Stream.run() +end) ``` -This query makes a [`transfer-encoding: chunked`](https://en.wikipedia.org/wiki/Chunked_transfer_encoding) HTTP request while unfolding the stream resulting in lower memory usage. +This query makes a `transfer-encoding: chunked` HTTP request while unfolding the stream, resulting in lower memory usage. 
#### Query with custom [settings](https://clickhouse.com/docs/en/operations/settings/settings) @@ -156,7 +167,7 @@ settings = [async_insert: 1] Ch.query!(pid, "SHOW SETTINGS LIKE 'async_insert'") %Ch.Result{rows: [["async_insert", "Bool", "1"]]} = - Ch.query!(pid, "SHOW SETTINGS LIKE 'async_insert'", [], settings: settings) + Ch.query!(pid, "SHOW SETTINGS LIKE 'async_insert'", _params = [], settings: settings) ``` ## Caveats @@ -179,13 +190,13 @@ CREATE TABLE ch_nulls ( """) types = ["Nullable(UInt8)", "UInt8", "UInt8"] -inserted_rows = [[nil, nil, nil]] -selected_rows = [[nil, 0, 0]] +row = [nil, nil, nil] +rowbinary = Ch.RowBinary.encode_row(row, types) %Ch.Result{num_rows: 1} = - Ch.query!(pid, "INSERT INTO ch_nulls(a, b, c) FORMAT RowBinary", inserted_rows, types: types) + Ch.query!(pid, ["INSERT INTO ch_nulls(a, b, c) FORMAT RowBinary\n" | rowbinary]) -%Ch.Result{rows: ^selected_rows} = +%Ch.Result{rows: [[nil, _not_10 = 0, 0]]} = Ch.query!(pid, "SELECT * FROM ch_nulls") ``` @@ -197,12 +208,16 @@ However, [`input()`](https://clickhouse.com/docs/en/sql-reference/table-function sql = """ INSERT INTO ch_nulls SELECT * FROM input('a Nullable(UInt8), b Nullable(UInt8), c UInt8') - FORMAT RowBinary\ + FORMAT RowBinary """ -Ch.query!(pid, sql, inserted_rows, types: ["Nullable(UInt8)", "Nullable(UInt8)", "UInt8"]) +types = ["Nullable(UInt8)", "Nullable(UInt8)", "UInt8"] +rowbinary = Ch.RowBinary.encode_row(row, types) + +%Ch.Result{num_rows: 1} = + Ch.query!(pid, [sql | rowbinary]) -%Ch.Result{rows: [[0], [10]]} = +%Ch.Result{rows: [_before = [0], _after = [10]]} = Ch.query!(pid, "SELECT b FROM ch_nulls ORDER BY b") ``` @@ -215,26 +230,18 @@ When decoding [`String`](https://clickhouse.com/docs/en/sql-reference/data-types Ch.query!(pid, "CREATE TABLE ch_utf8(str String) ENGINE Memory") -bin = "\x61\xF0\x80\x80\x80b" -utf8 = "a�b" +rowbinary = Ch.RowBinary.encode(:string, "\x61\xF0\x80\x80\x80b") %Ch.Result{num_rows: 1} = - Ch.query!(pid, "INSERT INTO ch_utf8(str) 
FORMAT RowBinary", [[bin]], types: ["String"]) + Ch.query!(pid, ["INSERT INTO ch_utf8(str) FORMAT RowBinary\n" | rowbinary]) -%Ch.Result{rows: [[^utf8]]} = +%Ch.Result{rows: [["a�b"]]} = Ch.query!(pid, "SELECT * FROM ch_utf8") -%Ch.Result{rows: %{"data" => [[^utf8]]}} = +%Ch.Result{rows: %{"data" => [["a�b"]]}} = pid |> Ch.query!("SELECT * FROM ch_utf8 FORMAT JSONCompact") |> Map.update!(:rows, &Jason.decode!/1) ``` -To get raw binary from `String` columns use `:binary` type that skips UTF-8 checks. - -```elixir -%Ch.Result{rows: [[^bin]]} = - Ch.query!(pid, "SELECT * FROM ch_utf8", [], types: [:binary]) -``` - #### Timezones in RowBinary Decoding non-UTC datetimes like `DateTime('Asia/Taipei')` requires a [timezone database.](https://hexdocs.pm/elixir/DateTime.html#module-time-zone-database) @@ -268,124 +275,9 @@ utc = DateTime.utc_now() taipei = DateTime.shift_zone!(utc, "Asia/Taipei") # ** (ArgumentError) non-UTC timezones are not supported for encoding: 2023-04-26 01:49:43.044569+08:00 CST Asia/Taipei -Ch.query!(pid, "INSERT INTO ch_datetimes(datetime) FORMAT RowBinary", [[naive], [utc], [taipei]], types: ["DateTime"]) +Ch.RowBinary.encode_rows([[naive], [utc], [taipei]], ["DateTime"]) ``` ## Benchmarks -
-INSERT 1 million rows (original) - -

-$ MIX_ENV=bench mix run bench/insert.exs
-
-This benchmark is based on https://github.com/ClickHouse/clickhouse-go#benchmark
-
-Operating System: macOS
-CPU Information: Apple M1
-Number of Available Cores: 8
-Available memory: 8 GB
-Elixir 1.14.4
-Erlang 25.3
-
-Benchmark suite executing with the following configuration:
-warmup: 2 s
-time: 5 s
-memory time: 0 ns
-reduction time: 0 ns
-parallel: 1
-inputs: 1_000_000 rows
-Estimated total run time: 28 s
-
-Benchmarking encode with input 1_000_000 rows ...
-Benchmarking encode stream with input 1_000_000 rows ...
-Benchmarking insert with input 1_000_000 rows ...
-Benchmarking insert stream with input 1_000_000 rows ...
-
-##### With input 1_000_000 rows #####
-Name                    ips        average  deviation         median         99th %
-encode stream          1.63      612.96 ms    ±11.30%      583.03 ms      773.01 ms
-insert stream          1.22      819.82 ms     ±9.41%      798.94 ms      973.45 ms
-encode                 1.09      915.75 ms    ±44.13%      750.98 ms     1637.02 ms
-insert                 0.73     1373.84 ms    ±31.01%     1331.86 ms     1915.76 ms
-
-Comparison: 
-encode stream          1.63
-insert stream          1.22 - 1.34x slower +206.87 ms
-encode                 1.09 - 1.49x slower +302.79 ms
-insert                 0.73 - 2.24x slower +760.88 ms
-
- -
- -
-SELECT 500, 500 thousand, and 500 million rows (original) - -

-$ MIX_ENV=bench mix run bench/stream.exs
-
-This benchmark is based on https://github.com/ClickHouse/ch-bench
-
-Operating System: macOS
-CPU Information: Apple M1
-Number of Available Cores: 8
-Available memory: 8 GB
-Elixir 1.14.4
-Erlang 25.3
-
-Benchmark suite executing with the following configuration:
-warmup: 2 s
-time: 5 s
-memory time: 0 ns
-reduction time: 0 ns
-parallel: 1
-inputs: 500 rows, 500_000 rows, 500_000_000 rows
-Estimated total run time: 1.05 min
-
-Benchmarking stream with decode with input 500 rows ...
-Benchmarking stream with decode with input 500_000 rows ...
-Benchmarking stream with decode with input 500_000_000 rows ...
-Benchmarking stream with manual decode with input 500 rows ...
-Benchmarking stream with manual decode with input 500_000 rows ...
-Benchmarking stream with manual decode with input 500_000_000 rows ...
-Benchmarking stream without decode with input 500 rows ...
-Benchmarking stream without decode with input 500_000 rows ...
-Benchmarking stream without decode with input 500_000_000 rows ...
-
-##### With input 500 rows #####
-Name                                ips        average  deviation         median         99th %
-stream with decode               4.69 K      213.34 μs    ±12.49%      211.38 μs      290.94 μs
-stream with manual decode        4.69 K      213.43 μs    ±17.40%      210.96 μs      298.75 μs
-stream without decode            4.65 K      215.08 μs    ±10.79%      213.79 μs      284.66 μs
-
-Comparison:
-stream with decode               4.69 K
-stream with manual decode        4.69 K - 1.00x slower +0.0838 μs
-stream without decode            4.65 K - 1.01x slower +1.74 μs
-
-##### With input 500_000 rows #####
-Name                                ips        average  deviation         median         99th %
-stream without decode            234.58        4.26 ms    ±13.99%        4.04 ms        5.95 ms
-stream with manual decode         64.26       15.56 ms     ±8.36%       15.86 ms       17.97 ms
-stream with decode                41.03       24.37 ms     ±6.27%       24.39 ms       26.60 ms
-
-Comparison:
-stream without decode            234.58
-stream with manual decode         64.26 - 3.65x slower +11.30 ms
-stream with decode                41.03 - 5.72x slower +20.11 ms
-
-##### With input 500_000_000 rows #####
-Name                                ips        average  deviation         median         99th %
-stream without decode              0.32         3.17 s     ±0.20%         3.17 s         3.17 s
-stream with manual decode        0.0891        11.23 s     ±0.00%        11.23 s        11.23 s
-stream with decode               0.0462        21.66 s     ±0.00%        21.66 s        21.66 s
-
-Comparison:
-stream without decode              0.32
-stream with manual decode        0.0891 - 3.55x slower +8.06 s
-stream with decode               0.0462 - 6.84x slower +18.50 s
-
- -
-[CI Results](https://github.com/plausible/ch/actions/workflows/bench.yml) (click the latest workflow run and scroll down to "Artifacts") +Please see [CI Results](https://github.com/plausible/ch/actions/workflows/bench.yml) (make sure to click the latest workflow run and scroll down to "Artifacts") for [some of our benchmarks.](./bench/) :) diff --git a/lib/ch.ex b/lib/ch.ex index ad4217f..dcc3002 100644 --- a/lib/ch.ex +++ b/lib/ch.ex @@ -14,7 +14,7 @@ defmodule Ch do | {:scheme, String.t()} | {:hostname, String.t()} | {:port, :inet.port_number()} - | {:transport_opts, :gen_tcp.connect_option()} + | {:transport_opts, :gen_tcp.connect_option() | :ssl.tls_client_option()} | DBConnection.start_option() @doc """ @@ -29,7 +29,7 @@ defmodule Ch do * `:database` - Database, defaults to `"default"` * `:username` - Username * `:password` - User password - * `:settings` - Keyword list of ClickHouse settings + * `:settings` - Keyword list of ClickHouse settings to send with every query * `:timeout` - HTTP receive timeout in milliseconds * `:transport_opts` - options to be given to the transport being used. 
See `Mint.HTTP1.connect/4` for more info * [`DBConnection.start_option()`](https://hexdocs.pm/db_connection/DBConnection.html#t:start_option/0) @@ -55,8 +55,6 @@ defmodule Ch do | {:command, Ch.Query.command()} | {:headers, [{String.t(), String.t()}]} | {:format, String.t()} - # TODO remove - | {:encode, boolean} | {:decode, boolean} | DBConnection.connection_option() @@ -69,8 +67,8 @@ defmodule Ch do * `:database` - Database * `:username` - Username * `:password` - User password - * `:settings` - Keyword list of settings - * `:timeout` - Query request timeout + * `:settings` - Keyword list of settings to merge with `:settings` from `start_link` and send with this query + * `:timeout` - Configures both query request timeout and HTTP receive timeout in milliseconds, whichever happens faster * `:command` - Command tag for the query * `:headers` - Custom HTTP headers for the request * `:format` - Custom response format for the request