Skip to content

Commit

Permalink
[GLUTEN-5341][VL] Enable linear-regression.sql in GlutenSQLQueryTestSuite for Spark 3.5 (#5469)
Browse files Browse the repository at this point in the history

Enable linear-regression.sql in GlutenSQLQueryTestSuite for Spark 3.5
  • Loading branch information
liujiayi771 authored Apr 22, 2024
1 parent b08258e commit 4d585fa
Show file tree
Hide file tree
Showing 3 changed files with 330 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
-- Test data.
-- Columns are (k, y, x). Group k = 1 holds a single row whose x is NULL, and
-- group k = 2 holds four rows of which one has a NULL x. y is never NULL.
CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES
(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35)
AS testRegression(k, y, x);

-- SPARK-37613: Support ANSI Aggregate Function: regr_count
-- Queried over the whole view, with NULL x rows removed by WHERE, and per group k.
-- In the grouped queries count(*) is selected next to regr_count(y, x) for
-- comparison: first over every row, then restricted to non-NULL x via FILTER.
SELECT regr_count(y, x) FROM testRegression;
SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL;
SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k;
SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM testRegression GROUP BY k;

-- SPARK-37613: Support ANSI Aggregate Function: regr_r2
-- Queried over the whole view, with NULL x rows removed by WHERE, and per group k.
-- In the grouped queries corr(y, x) is selected next to regr_r2(y, x) for
-- comparison: first over every row, then restricted to non-NULL x via FILTER.
SELECT regr_r2(y, x) FROM testRegression;
SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL;
SELECT k, corr(y, x), regr_r2(y, x) FROM testRegression GROUP BY k;
SELECT k, corr(y, x) FILTER (WHERE x IS NOT NULL), regr_r2(y, x) FROM testRegression GROUP BY k;

-- SPARK-37614: Support ANSI Aggregate Function: regr_avgx & regr_avgy
-- Queried over the whole view, with rows containing a NULL x or y removed by
-- WHERE, and per group k next to the plain avg(x) and avg(y).
-- avg(y) uses every row with a non-NULL y, while regr_avgy(y, x) only uses rows
-- where x is non-NULL as well, so the two can differ in the unfiltered grouped
-- query. The last query adds FILTER clauses so that the plain averages see only
-- rows where both x and y are non-NULL.
SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression;
SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k;
SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k;

-- SPARK-37672: Support ANSI Aggregate Function: regr_sxx
-- Run over the whole view and per group k, each time both unfiltered and with
-- rows containing a NULL x or y removed up front by WHERE. With the WHERE
-- filter, group k = 1 has no remaining rows because its only row has a NULL x.
SELECT regr_sxx(y, x) FROM testRegression;
SELECT regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
SELECT k, regr_sxx(y, x) FROM testRegression GROUP BY k;
SELECT k, regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;

-- SPARK-37681: Support ANSI Aggregate Function: regr_sxy
-- Run over the whole view and per group k, each time both unfiltered and with
-- rows containing a NULL x or y removed up front by WHERE. With the WHERE
-- filter, group k = 1 has no remaining rows because its only row has a NULL x.
SELECT regr_sxy(y, x) FROM testRegression;
SELECT regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
SELECT k, regr_sxy(y, x) FROM testRegression GROUP BY k;
SELECT k, regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;

-- SPARK-37702: Support ANSI Aggregate Function: regr_syy
-- Run over the whole view and per group k, each time both unfiltered and with
-- rows containing a NULL x or y removed up front by WHERE. With the WHERE
-- filter, group k = 1 has no remaining rows because its only row has a NULL x.
SELECT regr_syy(y, x) FROM testRegression;
SELECT regr_syy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
SELECT k, regr_syy(y, x) FROM testRegression GROUP BY k;
SELECT k, regr_syy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;

-- SPARK-39230: Support ANSI Aggregate Function: regr_slope
-- Run over the whole view and per group k, each time both unfiltered and with
-- rows containing a NULL x or y removed up front by WHERE. With the WHERE
-- filter, group k = 1 has no remaining rows because its only row has a NULL x.
SELECT regr_slope(y, x) FROM testRegression;
SELECT regr_slope(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
SELECT k, regr_slope(y, x) FROM testRegression GROUP BY k;
SELECT k, regr_slope(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;

-- SPARK-37623: Support ANSI Aggregate Function: regr_intercept
-- Run over the whole view and per group k, each time both unfiltered and with
-- rows containing a NULL x or y removed up front by WHERE. With the WHERE
-- filter, group k = 1 has no remaining rows because its only row has a NULL x.
SELECT regr_intercept(y, x) FROM testRegression;
SELECT regr_intercept(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
SELECT k, regr_intercept(y, x) FROM testRegression GROUP BY k;
SELECT k, regr_intercept(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
-- Automatically generated by SQLQueryTestSuite
-- !query
CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES
(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35)
AS testRegression(k, y, x)
-- !query schema
struct<>
-- !query output



-- !query
SELECT regr_count(y, x) FROM testRegression
-- !query schema
struct<regr_count(y, x):bigint>
-- !query output
3


-- !query
SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL
-- !query schema
struct<regr_count(y, x):bigint>
-- !query output
3


-- !query
SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,count(1):bigint,regr_count(y, x):bigint>
-- !query output
1 1 0
2 4 3


-- !query
SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,count(1) FILTER (WHERE (x IS NOT NULL)):bigint,regr_count(y, x):bigint>
-- !query output
1 0 0
2 3 3


-- !query
SELECT regr_r2(y, x) FROM testRegression
-- !query schema
struct<regr_r2(y, x):double>
-- !query output
0.9976905311778291


-- !query
SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL
-- !query schema
struct<regr_r2(y, x):double>
-- !query output
0.9976905311778291


-- !query
SELECT k, corr(y, x), regr_r2(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,corr(y, x):double,regr_r2(y, x):double>
-- !query output
1 NULL NULL
2 0.9988445981121532 0.9976905311778291


-- !query
SELECT k, corr(y, x) FILTER (WHERE x IS NOT NULL), regr_r2(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,corr(y, x) FILTER (WHERE (x IS NOT NULL)):double,regr_r2(y, x):double>
-- !query output
1 NULL NULL
2 0.9988445981121532 0.9976905311778291


-- !query
SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression
-- !query schema
struct<regr_avgx(y, x):double,regr_avgy(y, x):double>
-- !query output
22.666666666666668 20.0


-- !query
SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
-- !query schema
struct<regr_avgx(y, x):double,regr_avgy(y, x):double>
-- !query output
22.666666666666668 20.0


-- !query
SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,avg(x):double,avg(y):double,regr_avgx(y, x):double,regr_avgy(y, x):double>
-- !query output
1 NULL 10.0 NULL NULL
2 22.666666666666668 21.25 22.666666666666668 20.0


-- !query
SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,avg(x) FILTER (WHERE ((x IS NOT NULL) AND (y IS NOT NULL))):double,avg(y) FILTER (WHERE ((x IS NOT NULL) AND (y IS NOT NULL))):double,regr_avgx(y, x):double,regr_avgy(y, x):double>
-- !query output
1 NULL NULL NULL NULL
2 22.666666666666668 20.0 22.666666666666668 20.0


-- !query
SELECT regr_sxx(y, x) FROM testRegression
-- !query schema
struct<regr_sxx(y, x):double>
-- !query output
288.66666666666663


-- !query
SELECT regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
-- !query schema
struct<regr_sxx(y, x):double>
-- !query output
288.66666666666663


-- !query
SELECT k, regr_sxx(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,regr_sxx(y, x):double>
-- !query output
1 NULL
2 288.66666666666663


-- !query
SELECT k, regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
-- !query schema
struct<k:int,regr_sxx(y, x):double>
-- !query output
2 288.66666666666663


-- !query
SELECT regr_sxy(y, x) FROM testRegression
-- !query schema
struct<regr_sxy(y, x):double>
-- !query output
240.0


-- !query
SELECT regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
-- !query schema
struct<regr_sxy(y, x):double>
-- !query output
240.0


-- !query
SELECT k, regr_sxy(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,regr_sxy(y, x):double>
-- !query output
1 NULL
2 240.0


-- !query
SELECT k, regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
-- !query schema
struct<k:int,regr_sxy(y, x):double>
-- !query output
2 240.0


-- !query
SELECT regr_syy(y, x) FROM testRegression
-- !query schema
struct<regr_syy(y, x):double>
-- !query output
200.0


-- !query
SELECT regr_syy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
-- !query schema
struct<regr_syy(y, x):double>
-- !query output
200.0


-- !query
SELECT k, regr_syy(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,regr_syy(y, x):double>
-- !query output
1 NULL
2 200.0


-- !query
SELECT k, regr_syy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
-- !query schema
struct<k:int,regr_syy(y, x):double>
-- !query output
2 200.0


-- !query
SELECT regr_slope(y, x) FROM testRegression
-- !query schema
struct<regr_slope(y, x):double>
-- !query output
0.8314087759815244


-- !query
SELECT regr_slope(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
-- !query schema
struct<regr_slope(y, x):double>
-- !query output
0.8314087759815244


-- !query
SELECT k, regr_slope(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,regr_slope(y, x):double>
-- !query output
1 NULL
2 0.8314087759815244


-- !query
SELECT k, regr_slope(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
-- !query schema
struct<k:int,regr_slope(y, x):double>
-- !query output
2 0.8314087759815244


-- !query
SELECT regr_intercept(y, x) FROM testRegression
-- !query schema
struct<regr_intercept(y, x):double>
-- !query output
1.1547344110854487


-- !query
SELECT regr_intercept(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
-- !query schema
struct<regr_intercept(y, x):double>
-- !query output
1.1547344110854487


-- !query
SELECT k, regr_intercept(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,regr_intercept(y, x):double>
-- !query output
1 NULL
2 1.1547344110854487


-- !query
SELECT k, regr_intercept(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
-- !query schema
struct<k:int,regr_intercept(y, x):double>
-- !query output
2 1.1547344110854487
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ object VeloxSQLQueryTestSettings extends SQLQueryTestSettings {
// -- Aggregate with nulls.
"group-by.sql",
"udf/udf-group-by.sql",
// Overwrite some results of regr_intercept, regr_r2, corr.
"linear-regression.sql",
// Exception string doesn't match for
// SELECT (SELECT a FROM (SELECT 1 AS a UNION ALL SELECT 2 AS a) t) AS b
"subquery/scalar-subquery/scalar-subquery-select.sql"
Expand Down

0 comments on commit 4d585fa

Please sign in to comment.