Skip to content

Commit 9dc5aa1

Browse files
committed
add group-by udf tests
1 parent 971e832 commit 9dc5aa1

File tree

2 files changed

+672
-0
lines changed

2 files changed

+672
-0
lines changed
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
-- This test file was converted from group-by.sql.
2+
-- Test data.
3+
CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
4+
(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
5+
AS testData(a, b);
6+
7+
-- Aggregate with empty GroupBy expressions.
8+
SELECT udf(a), udf(COUNT(b)) FROM testData;
9+
SELECT COUNT(udf(a)), udf(COUNT(b)) FROM testData;
10+
11+
-- Aggregate with non-empty GroupBy expressions.
12+
SELECT udf(a), COUNT(udf(b)) FROM testData GROUP BY a;
13+
SELECT udf(a), udf(COUNT(udf(b))) FROM testData GROUP BY b;
14+
SELECT COUNT(udf(a)), COUNT(udf(b)) FROM testData GROUP BY udf(a);
15+
16+
-- Aggregate grouped by literals.
17+
SELECT 'foo', COUNT(udf(a)) FROM testData GROUP BY 1;
18+
19+
-- Aggregate grouped by literals (whole stage code generation).
20+
SELECT 'foo' FROM testData WHERE a = 0 GROUP BY udf(1);
21+
22+
-- Aggregate grouped by literals (hash aggregate).
23+
SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 GROUP BY 1;
24+
25+
-- Aggregate grouped by literals (sort aggregate).
26+
SELECT 'foo', MAX(STRUCT(udf(a))) FROM testData WHERE a = 0 GROUP BY 1;
27+
28+
-- Aggregate with complex GroupBy expressions.
29+
SELECT udf(a + b), udf(COUNT(b)) FROM testData GROUP BY a + b;
30+
SELECT udf(a + 2), udf(COUNT(b)) FROM testData GROUP BY a + 1;
31+
32+
-- [SPARK-28445] Inconsistency between Scala and Python/Panda udfs when groupby with udf() is used
33+
-- The following query will make Scala UDF work, but Python and Pandas udfs will fail with an AnalysisException.
34+
-- The query should be added after SPARK-28445.
35+
-- SELECT udf(a + 1), udf(COUNT(b)) FROM testData GROUP BY udf(a + 1);
36+
37+
-- Aggregate with nulls.
38+
SELECT SKEWNESS(udf(a)), udf(KURTOSIS(a)), udf(MIN(a)), MAX(udf(a)), udf(AVG(udf(a))), udf(VARIANCE(a)), STDDEV(udf(a)), udf(SUM(a)), udf(COUNT(a))
39+
FROM testData;
40+
41+
-- Aggregate with foldable input and multiple distinct groups.
42+
SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a;
43+
44+
-- Aliases in SELECT could be used in GROUP BY
45+
SELECT a AS k, COUNT(udf(b)) FROM testData GROUP BY k;
46+
SELECT a AS k, udf(COUNT(b)) FROM testData GROUP BY k HAVING k > 1;
47+
48+
-- Aggregate functions cannot be used in GROUP BY
49+
SELECT udf(COUNT(b)) AS k FROM testData GROUP BY k;
50+
51+
-- Test data.
52+
CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES
53+
(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v);
54+
SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY a;
55+
56+
-- turn off group by aliases
57+
set spark.sql.groupByAliases=false;
58+
59+
-- Check analysis exceptions
60+
SELECT a AS k, udf(COUNT(udf(b))) FROM testData GROUP BY k;
61+
62+
-- Aggregate with empty input and non-empty GroupBy expressions.
63+
SELECT a, COUNT(udf(1)) FROM testData WHERE false GROUP BY a;
64+
65+
-- Aggregate with empty input and empty GroupBy expressions.
66+
SELECT udf(COUNT(1)) FROM testData WHERE false;
67+
SELECT 1 FROM (SELECT udf(COUNT(1)) FROM testData WHERE false) t;
68+
69+
-- Aggregate with empty GroupBy expressions and filter on top
70+
SELECT 1 from (
71+
SELECT 1 AS z,
72+
udf(MIN(a.x))
73+
FROM (select 1 as x) a
74+
WHERE false
75+
) b
76+
where b.z != b.z;
77+
78+
-- SPARK-24369 multiple distinct aggregations having the same argument set
79+
SELECT corr(DISTINCT x, y), udf(corr(DISTINCT y, x)), count(*)
80+
FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y);
81+
82+
-- SPARK-25708 HAVING without GROUP BY means global aggregate
83+
SELECT udf(1) FROM range(10) HAVING true;
84+
85+
SELECT udf(udf(1)) FROM range(10) HAVING MAX(id) > 0;
86+
87+
SELECT udf(id) FROM range(10) HAVING id > 0;
88+
89+
-- Test data
90+
CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES
91+
(1, true), (1, false),
92+
(2, true),
93+
(3, false), (3, null),
94+
(4, null), (4, null),
95+
(5, null), (5, true), (5, false) AS test_agg(k, v);
96+
97+
-- empty table
98+
SELECT udf(every(v)), udf(some(v)), any(v) FROM test_agg WHERE 1 = 0;
99+
100+
-- all null values
101+
SELECT udf(every(udf(v))), some(v), any(v) FROM test_agg WHERE k = 4;
102+
103+
-- aggregates are null Filtering
104+
SELECT every(v), udf(some(v)), any(v) FROM test_agg WHERE k = 5;
105+
106+
-- group by
107+
SELECT k, every(v), udf(some(v)), any(v) FROM test_agg GROUP BY k;
108+
109+
-- having
110+
SELECT udf(k), every(v) FROM test_agg GROUP BY k HAVING every(v) = false;
111+
SELECT k, udf(every(v)) FROM test_agg GROUP BY k HAVING every(v) IS NULL;
112+
113+
-- basic subquery path to make sure rewrite happens in both parent and child plans.
114+
SELECT k,
115+
udf(Every(v)) AS every
116+
FROM test_agg
117+
WHERE k = 2
118+
AND v IN (SELECT Any(v)
119+
FROM test_agg
120+
WHERE k = 1)
121+
GROUP BY k;
122+
123+
-- basic subquery path to make sure rewrite happens in both parent and child plans.
124+
SELECT udf(udf(k)),
125+
Every(v) AS every
126+
FROM test_agg
127+
WHERE k = 2
128+
AND v IN (SELECT Every(v)
129+
FROM test_agg
130+
WHERE k = 1)
131+
GROUP BY k;
132+
133+
-- input type checking Int
134+
SELECT every(udf(1));
135+
136+
-- input type checking Short
137+
SELECT some(udf(1S));
138+
139+
-- input type checking Long
140+
SELECT any(udf(1L));
141+
142+
-- input type checking String
143+
SELECT udf(every("true"));
144+
145+
-- every/some/any aggregates are supported as windows expression.
146+
SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg;
147+
SELECT k, udf(udf(v)), some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg;
148+
SELECT udf(udf(k)), v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg;
149+
150+
-- Having referencing aggregate expressions is ok.
151+
SELECT udf(count(*)) FROM test_agg HAVING count(*) > 1L;
152+
SELECT k, udf(max(v)) FROM test_agg GROUP BY k HAVING max(v) = true;
153+
154+
-- Aggrgate expressions can be referenced through an alias
155+
SELECT * FROM (SELECT udf(COUNT(*)) AS cnt FROM test_agg) WHERE cnt > 1L;
156+
157+
-- Error when aggregate expressions are in where clause directly
158+
SELECT udf(count(*)) FROM test_agg WHERE count(*) > 1L;
159+
SELECT udf(count(*)) FROM test_agg WHERE count(*) + 1L > 1L;
160+
SELECT udf(count(*)) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1;

0 commit comments

Comments
 (0)