Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clickhouse string ordering and string filtering by UTF8 instead of bytes #6143

Open
wants to merge 31 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
63b935c
Remove unnecessary concat function, and use UTF8 compatible functions…
casab Feb 9, 2023
1ea8d11
Merge remote-tracking branch 'upstream/master' into feature/clickhous…
casab Jun 7, 2024
49111c2
Add integration testing for ordering with collation
casab Jun 7, 2024
153b3fd
Merge branch 'master' into feature/clickhouse_utf8_filter_order
casab Nov 22, 2024
b2c4ee9
Move case sensitive entry from ClickHouseDbRunner.js to ClickHouseDbR…
casab Jan 24, 2025
dd2fd32
Added back CONCAT() to prevent sql injection
casab Jan 24, 2025
f26f4fe
Merge branch 'master' into feature/clickhouse_utf8_filter_order
KSDaemon Jan 24, 2025
a5b279e
Change intentation from 4 spaces to 2 spaces
casab Jan 24, 2025
f7a4759
Only apply coalition if field's type is string
casab Jan 24, 2025
83437e9
Merge branch 'master' into feature/clickhouse_utf8_filter_order
KSDaemon Jan 27, 2025
600c1fa
Replaced Google with Gork to make ordering clear, added toValidUTF8 t…
casab Jan 27, 2025
eadfa1a
Added CUBEJS_DB_CLICKHOUSE_SORT_COLLATION env variable for collation …
casab Jan 27, 2025
0610d16
Merge branch 'master' into feature/clickhouse_utf8_filter_order
casab Jan 28, 2025
8807178
Only use collation for clickhouse if env variable is set
casab Jan 28, 2025
f095335
Merge branch 'master' into feature/clickhouse_utf8_filter_order
casab Feb 4, 2025
e76b91a
Merge branch 'master' into feature/clickhouse_utf8_filter_order
casab Feb 5, 2025
fdebe6e
Merge branch 'master' into feature/clickhouse_utf8_filter_order
casab Feb 6, 2025
be6875b
Add CUBEJS_DB_CLICKHOUSE_USE_COLLATION env var to disable collation, …
casab Feb 6, 2025
4fc9ed4
Fix default clickhouseSortCollection value
casab Feb 6, 2025
b44e57e
Use ILIKE instead of LIKE with lowerUTF8 and toValidUTF8
casab Feb 6, 2025
a2616fe
Merge branch 'master' into feature/clickhouse_utf8_filter_order
KSDaemon Feb 6, 2025
ce3ec56
Fix clickhouseUseCollation tests to check for default value
casab Feb 6, 2025
ee06aee
Merge branch 'master' into feature/clickhouse_utf8_filter_order
KSDaemon Feb 6, 2025
b649df2
Fix clickhouse integration tests for the missing values
casab Feb 6, 2025
716f24b
Merge branch 'master' into feature/clickhouse_utf8_filter_order
KSDaemon Feb 6, 2025
79ab001
Assert collation in the 'collation in order by' clickhouse integratio…
casab Feb 6, 2025
f660674
Merge branch 'master' into feature/clickhouse_utf8_filter_order
casab Feb 11, 2025
beac449
Fix getFieldType by using hash.id instead
casab Feb 11, 2025
bd29d16
Merge branch 'master' into feature/clickhouse_utf8_filter_order
casab Feb 13, 2025
a62eb42
Merge branch 'master' into feature/clickhouse_utf8_filter_order
casab Feb 13, 2025
225c19a
Merge branch 'master' into feature/clickhouse_utf8_filter_order
casab Feb 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions packages/cubejs-backend-shared/src/env.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1132,6 +1132,19 @@ const variables: Record<string, (...args: any) => any> = {
]
),

/**
* ClickHouse sort collation.
*/
clickhouseSortCollation: ({
dataSource
}: {
dataSource: string,
}) => (
process.env[
keyByDataSource('CUBEJS_DB_CLICKHOUSE_SORT_COLLATION', dataSource)
]
),

/** ****************************************************************
* ElasticSearch Driver *
***************************************************************** */
Expand Down
29 changes: 29 additions & 0 deletions packages/cubejs-backend-shared/test/db_env_multi.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1539,6 +1539,35 @@ describe('Multiple datasources', () => {
);
});

test('getEnv("clickhouseSortCollation")', () => {
process.env.CUBEJS_DB_CLICKHOUSE_SORT_COLLATION = 'default1';
process.env.CUBEJS_DS_POSTGRES_DB_CLICKHOUSE_SORT_COLLATION = 'postgres1';
process.env.CUBEJS_DS_WRONG_DB_CLICKHOUSE_SORT_COLLATION = 'wrong1';
expect(getEnv('clickhouseSortCollation', { dataSource: 'default' })).toEqual('default1');
expect(getEnv('clickhouseSortCollation', { dataSource: 'postgres' })).toEqual('postgres1');
expect(() => getEnv('clickhouseSortCollation', { dataSource: 'wrong' })).toThrow(
'The wrong data source is missing in the declared CUBEJS_DATASOURCES.'
);

process.env.CUBEJS_DB_CLICKHOUSE_SORT_COLLATION = 'default2';
process.env.CUBEJS_DS_POSTGRES_DB_CLICKHOUSE_SORT_COLLATION = 'postgres2';
process.env.CUBEJS_DS_WRONG_DB_CLICKHOUSE_SORT_COLLATION = 'wrong2';
expect(getEnv('clickhouseSortCollation', { dataSource: 'default' })).toEqual('default2');
expect(getEnv('clickhouseSortCollation', { dataSource: 'postgres' })).toEqual('postgres2');
expect(() => getEnv('clickhouseSortCollation', { dataSource: 'wrong' })).toThrow(
'The wrong data source is missing in the declared CUBEJS_DATASOURCES.'
);

delete process.env.CUBEJS_DB_CLICKHOUSE_SORT_COLLATION;
delete process.env.CUBEJS_DS_POSTGRES_DB_CLICKHOUSE_SORT_COLLATION;
delete process.env.CUBEJS_DS_WRONG_DB_CLICKHOUSE_SORT_COLLATION;
expect(getEnv('clickhouseSortCollation', { dataSource: 'default' })).toBeUndefined();
expect(getEnv('clickhouseSortCollation', { dataSource: 'postgres' })).toBeUndefined();
expect(() => getEnv('clickhouseSortCollation', { dataSource: 'wrong' })).toThrow(
'The wrong data source is missing in the declared CUBEJS_DATASOURCES.'
);
});

test('getEnv("elasticApiId")', () => {
process.env.CUBEJS_DB_ELASTIC_APIKEY_ID = 'default1';
process.env.CUBEJS_DS_POSTGRES_DB_ELASTIC_APIKEY_ID = 'postgres1';
Expand Down
17 changes: 17 additions & 0 deletions packages/cubejs-backend-shared/test/db_env_single.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -975,6 +975,23 @@ describe('Single datasources', () => {
expect(getEnv('clickhouseReadOnly', { dataSource: 'wrong' })).toBeUndefined();
});

test('getEnv("clickhouseSortCollation")', () => {
process.env.CUBEJS_DB_CLICKHOUSE_SORT_COLLATION = 'default1';
expect(getEnv('clickhouseSortCollation', { dataSource: 'default' })).toEqual('default1');
expect(getEnv('clickhouseSortCollation', { dataSource: 'postgres' })).toEqual('default1');
expect(getEnv('clickhouseSortCollation', { dataSource: 'wrong' })).toEqual('default1');

process.env.CUBEJS_DB_CLICKHOUSE_SORT_COLLATION = 'default2';
expect(getEnv('clickhouseSortCollation', { dataSource: 'default' })).toEqual('default2');
expect(getEnv('clickhouseSortCollation', { dataSource: 'postgres' })).toEqual('default2');
expect(getEnv('clickhouseSortCollation', { dataSource: 'wrong' })).toEqual('default2');

delete process.env.CUBEJS_DB_CLICKHOUSE_SORT_COLLATION;
expect(getEnv('clickhouseSortCollation', { dataSource: 'default' })).toBeUndefined();
expect(getEnv('clickhouseSortCollation', { dataSource: 'postgres' })).toBeUndefined();
expect(getEnv('clickhouseSortCollation', { dataSource: 'wrong' })).toBeUndefined();
});

test('getEnv("elasticApiId")', () => {
process.env.CUBEJS_DB_ELASTIC_APIKEY_ID = 'default1';
expect(getEnv('elasticApiId', { dataSource: 'default' })).toEqual('default1');
Expand Down
59 changes: 52 additions & 7 deletions packages/cubejs-schema-compiler/src/adapter/ClickHouseQuery.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import { parseSqlInterval } from '@cubejs-backend/shared';
import R from 'ramda';

import { getEnv, parseSqlInterval } from '@cubejs-backend/shared';
import { BaseQuery } from './BaseQuery';
import { BaseFilter } from './BaseFilter';
import { UserError } from '../compiler/UserError';
Expand All @@ -18,7 +20,7 @@ class ClickHouseFilter extends BaseFilter {
public likeIgnoreCase(column, not, param, type) {
const p = (!type || type === 'contains' || type === 'ends') ? '%' : '';
const s = (!type || type === 'contains' || type === 'starts') ? '%' : '';
return `lower(${column}) ${not ? 'NOT' : ''} LIKE CONCAT('${p}', lower(${this.allocateParam(param)}), '${s}')`;
return `lowerUTF8(toValidUTF8(${column})) ${not ? 'NOT' : ''} LIKE CONCAT('${p}', lowerUTF8(toValidUTF8(${this.allocateParam(param)})), '${s}')`;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just use ILIKE instead of toValidUTF8? I mean, it was written like this, but since you are messing around here - why not? Both lowerUTF8 and ILIKE can handle UTF-8, but ILIKE is a bit more direct, IMO.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure if ILIKE is UTF-8 compatible. It might just be calling lower(). Let me ask this to the Clickhouse folk first.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is, for UTF-8 strings _ has a "single code point" meaning

https://clickhouse.com/docs/en/sql-reference/functions/string-search-functions#ilike

}

public castParameter() {
Expand Down Expand Up @@ -123,7 +125,7 @@ export class ClickHouseQuery extends BaseQuery {
.join(' AND ');
}

public getFieldAlias(id) {
public getField(id) {
const equalIgnoreCase = (a, b) => (
typeof a === 'string' && typeof b === 'string' && a.toUpperCase() === b.toUpperCase()
);
Expand All @@ -134,16 +136,30 @@ export class ClickHouseQuery extends BaseQuery {
d => equalIgnoreCase(d.dimension, id),
);

if (!field) {
field = this.measures.find(
d => equalIgnoreCase(d.measure, id) || equalIgnoreCase(d.expressionName, id),
);
}

return field;
}

public getFieldAlias(id) {
const field = this.getField(id);

if (field) {
return field.aliasName();
}

field = this.measures.find(
d => equalIgnoreCase(d.measure, id) || equalIgnoreCase(d.expressionName, id),
);
return null;
}

public getFieldType(id) {
const field = this.getField(id);

if (field) {
return field.aliasName();
return field.definition().type;
}

return null;
Expand All @@ -168,6 +184,35 @@ export class ClickHouseQuery extends BaseQuery {
return `${fieldAlias} ${direction}`;
}

public override orderBy() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to mention, it's not exactly clear to me how to fix it, so maybe comment with TODO/caveat would be enough

It's not enough to override orderBy, templates.expressions.order_by and templates.statements.select should be adjusted as well: they can be used by SQL API while generating complex queries, and it would not pick up changes here.

I think, expressions.order_by is not used at all at the moment, but, probably, it should. And even then, COLLATE can only be used with strings, so that would require plumbing type information into template.

Consider query like this coming to Cube's SQL API

SELECT
    "measure_count_distinct",
    COUNT(DISTINCT "dim_str0") AS "distinct_dimension_values"
FROM (
    SELECT
        COUNT(DISTINCT "countDistinct") AS "measure_count_distinct",
        CAST("dim_str0" AS TEXT) AS "dim_str0"
    FROM "MultiTypeCube"
-- this filter is important
    WHERE
        LOWER("dim_str1") = 'foo'
    GROUP BY
        2
) t
GROUP BY 1
ORDER BY 1

Inner query can (almost) be represented as regular query to Cube.
But because of filter it would be executed via SQL pushdown. And ORDER BY in query to ClickHouse will be generated around here:

let resulting_sql = generator
.get_sql_templates()
.select(
sql.sql.to_string(),
projection,
group_by,
group_descs,
aggregate,
// TODO
from_alias.unwrap_or("".to_string()),
if !filter.is_empty() {
Some(
filter
.iter()
.map(|f| f.expr.to_string())
.join(" AND "),
)
} else {
None
},
None,
order,
limit,
offset,
distinct,
)

This code path will not use orderBy methods, but will use templates, that can be overridden by adapter.

But there's more!
If there's no filter this whole query would execute as data load through Cube for inner query, and post-processing on SQL API side for outer, including sorting. This sorting would not be done by ClickHouse, so it would completely ignore collation setting, and, in some sense, may result in unexpected ordering change for seemingly.
This issue is not specific to ClickHouse, or collations. I just wanted to highlight that Cube already can have different orderings for same source DB and dataset, but different queries.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mcheshkov As far as I can see expressions.order_by was not used. But expressions.sort is used. I have made changes to the templates. And tried to give data_type for the column in the expression from the Rust part. I would appreciate it if you let me know if I'm on the right path.

To be honest, I got a little bit overwhelmed by cubesql code. It took sometime reading the code to understand what you meant. I think I've got to understand in which cases SQL API pushes down queries, and how SQL API handles sorting.

//
// ClickHouse orders string by bytes, so we need to use COLLATE 'en' to order by string
//
if (R.isEmpty(this.order)) {
return '';
}

const collation = getEnv('clickhouseSortCollation');

const orderByString = R.pipe(
R.map((order) => {
let orderString = this.orderHashToString(order);
if (this.getFieldType(order) === 'string') && collation !== '' {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the mess, after thinking for a while, I agree with your original suggestion to fall back to en locale as it will work more correctly even for data in other locales...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As written before it didn't allow removing COLLATE at all, and I think it's important to have a way to disable it. At the very least, to keep old behaviour for users, who for any reason does not expect this change.
I think we can enable it by default, but way to opt-out from using collations is a must: without opt-out there's no way to make ordering case-sensitive, and even if we want to force it, users would have only one way to fix this during migration - revert back to previous release and keep waiting.

I, personally, would not expect it to use collation by default, but I also find default collations in PostgreSQL or MySQL strange, so maybe it's just me.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something I wrote when reviewing previous version:

Any collation in order by makes ordering case-insensitive in ClickHouse, and it would not match orderBy for any other DB adapter in Cube, at least from ops perspective.

Consider query like this:

SELECT
    s
FROM (
    SELECT
        'a' AS s
    UNION ALL
    SELECT
        'A' AS s
    UNION ALL
    SELECT
        'a' AS s
    UNION ALL
    SELECT
        'b' AS s
    UNION ALL
    SELECT
        'B' AS s
    UNION ALL
    SELECT
        'b' AS s
    ) t
ORDER BY
    s COLLATE 'en'
;

With COLLATE 'en' ClickHouse would return all a and A, and then all b and B.
But for DBMS like PostgreSQL this would depend on type of column (and encoding, for that matter), so admin of warehouse can decide how to order those, not admin of Cube, and they can decide whether it should or shouldn't be case sensitive.

Also keep in mind that Cube can issue ungrouped queries with order and limit, that would need to sort very large set of rows, so performance impact from enabling it can (I think) be huge in some specific cases, and specifically it can be unexpected for ClickHouse

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree on being able to disable collation behavior. We might document it to set the env variable to a specific value. Or I might add an another env variable.

The problem with Clickhouse is that it only allows collation in the order by clause. Nothing else, so there is no database level, table level, column level, session level collation setting anywhere. Otherwise I would agree.

orderString = `${orderString} COLLATE '${collation}'`;
}
return orderString;
}),
R.reject(R.isNil),
R.join(', ')
)(this.order);

if (!orderByString) {
return '';
}

return ` ORDER BY ${orderByString}`;
}

public groupByClause() {
if (this.ungrouped) {
return '';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ export class ClickHouseDbRunner extends BaseDbRunner {
(3, 300, '2017-01-05 16:00:00', '2017-01-19 16:00:00', 2, 'google', 120.120, 70.60),
(4, 400, '2017-01-06 16:00:00', '2017-01-24 16:00:00', 2, null, 120.120, 10.60),
(5, 500, '2017-01-06 16:00:00', '2017-01-24 16:00:00', 2, null, 120.120, 58.10),
(6, 500, '2016-09-06 16:00:00', '2016-09-06 16:00:00', 2, null, 120.120, 58.10)
(6, 500, '2016-09-06 16:00:00', '2016-09-06 16:00:00', 2, null, 120.120, 58.10),
(7, 300, '2017-01-07 16:00:00', '2017-01-25 16:00:00', 2, 'Gork', 120.120, 59.60)
` });

await clickHouse.command({ query: `
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,8 @@ describe('ClickHouse DataSchemaCompiler', () => {
{ visitors__created_at_day: '2017-01-02T00:00:00.000', visitors__visitor_count: '1' },
{ visitors__created_at_day: '2017-01-04T00:00:00.000', visitors__visitor_count: '1' },
{ visitors__created_at_day: '2017-01-05T00:00:00.000', visitors__visitor_count: '1' },
{ visitors__created_at_day: '2017-01-06T00:00:00.000', visitors__visitor_count: '2' }
{ visitors__created_at_day: '2017-01-06T00:00:00.000', visitors__visitor_count: '2' },
{ visitors__created_at_day: '2017-01-07T00:00:00.000', visitors__visitor_count: '1' }
]
);
});
Expand Down Expand Up @@ -229,7 +230,7 @@ describe('ClickHouse DataSchemaCompiler', () => {
expect(res).toEqual(
[
{ visitors__status: 'Approved', visitors__visitor_count: '2' },
{ visitors__status: 'Canceled', visitors__visitor_count: '4' }
{ visitors__status: 'Canceled', visitors__visitor_count: '5' }
]
);
});
Expand Down Expand Up @@ -299,6 +300,7 @@ describe('ClickHouse DataSchemaCompiler', () => {
expect(res).toEqual(
[
{ visitors__enabled_source: 'google', visitors__visitor_count: '1' },
{ visitors__enabled_source: 'Gork', visitors__visitor_count: '1' },
{ visitors__enabled_source: 'some', visitors__visitor_count: '2' },
{ visitors__enabled_source: null, visitors__visitor_count: '3' },
]
Expand Down Expand Up @@ -338,7 +340,8 @@ describe('ClickHouse DataSchemaCompiler', () => {
{ visitors__created_at: '2016-09-06T16:00:00.000' },
{ visitors__created_at: '2017-01-04T16:00:00.000' },
{ visitors__created_at: '2017-01-05T16:00:00.000' },
{ visitors__created_at: '2017-01-06T16:00:00.000' }
{ visitors__created_at: '2017-01-06T16:00:00.000' },
{ visitors__created_at: '2017-01-07T16:00:00.000' }
],
[{ visitors__created_at: '2017-01-06T16:00:00.000' }],
[
Expand All @@ -347,7 +350,10 @@ describe('ClickHouse DataSchemaCompiler', () => {
{ visitors__created_at: '2017-01-04T16:00:00.000' },
{ visitors__created_at: '2017-01-05T16:00:00.000' }
],
[{ visitors__created_at: '2017-01-06T16:00:00.000' }]
[
{ visitors__created_at: '2017-01-06T16:00:00.000' },
{ visitors__created_at: '2017-01-07T16:00:00.000' }
]
];
['in_date_range', 'not_in_date_range', 'on_the_date', 'before_date', 'after_date'].map((operator, index) => {
const filterValues = index < 2 ? ['2017-01-01', '2017-01-03'] : ['2017-01-06', '2017-01-06'];
Expand Down Expand Up @@ -377,6 +383,41 @@ describe('ClickHouse DataSchemaCompiler', () => {
return true;
});
}
it('collation in order by', async () => {
const { compiler, cubeEvaluator, joinGraph } = testPrepareCompiler(`
cube('visitors', {
sql: \`
select * from visitors
\`,

dimensions: {
source: {
type: 'string',
sql: 'source'
}
}
})
`);
await compiler.compile();

const query = new ClickHouseQuery({ joinGraph, cubeEvaluator, compiler }, {
measures: [],
dimensions: ['visitors.source'],
order: [{
id: 'visitors.source',
desc: false
}],
timezone: 'America/Los_Angeles'
});
logSqlAndParams(query);

const sqlAndParams = query.buildSqlAndParams();
const res = await dbRunner.testQuery(sqlAndParams);
const sql = sqlAndParams[0];
expect(sql).toMatch('ORDER BY `visitors__source` COLLATE \'en\'');

expect(res).toEqual([{ visitors__source: 'google' }, { visitors__source: 'Gork' }, { visitors__source: 'some' }]);
});

it('export import', () => {
const { compiler, cubeEvaluator, joinGraph } = prepareCompiler({
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -380,9 +380,9 @@ describe('ClickHouse JoinGraph', () => {
order: []
}, [{
visitors__visitor_revenue: '300',
visitors__visitor_count: '5',
visitors__visitor_count: '6',
visitor_checkins__visitor_checkins_count: '6',
visitors__per_visitor_revenue: '60'
visitors__per_visitor_revenue: '50'
}]));

// FAILS - need to finish query to override ::timestamptz
Expand Down Expand Up @@ -1222,6 +1222,7 @@ describe('ClickHouse JoinGraph', () => {
{ visitors__location: '120.12,40.6' },
{ visitors__location: '120.12,58.1' },
{ visitors__location: '120.12,58.6' },
{ visitors__location: '120.12,59.6' },
{ visitors__location: '120.12,70.6' }
]));

Expand Down