|
| 1 | +-- This test file was converted from group-by.sql. |
| 2 | +-- Test data. |
| 3 | +CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES |
| 4 | +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) |
| 5 | +AS testData(a, b); |
| 6 | + |
| 7 | +-- Aggregate with empty GroupBy expressions. |
| 8 | +SELECT udf(a), udf(COUNT(b)) FROM testData; |
| 9 | +SELECT COUNT(udf(a)), udf(COUNT(b)) FROM testData; |
| 10 | + |
| 11 | +-- Aggregate with non-empty GroupBy expressions. |
| 12 | +SELECT udf(a), COUNT(udf(b)) FROM testData GROUP BY a; |
| 13 | +SELECT udf(a), udf(COUNT(udf(b))) FROM testData GROUP BY b; |
| 14 | +SELECT COUNT(udf(a)), COUNT(udf(b)) FROM testData GROUP BY udf(a); |
| 15 | + |
| 16 | +-- Aggregate grouped by literals. |
| 17 | +SELECT 'foo', COUNT(udf(a)) FROM testData GROUP BY 1; |
| 18 | + |
| 19 | +-- Aggregate grouped by literals (whole stage code generation). |
| 20 | +SELECT 'foo' FROM testData WHERE a = 0 GROUP BY udf(1); |
| 21 | + |
| 22 | +-- Aggregate grouped by literals (hash aggregate). |
| 23 | +SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 GROUP BY 1; |
| 24 | + |
| 25 | +-- Aggregate grouped by literals (sort aggregate). |
| 26 | +SELECT 'foo', MAX(STRUCT(udf(a))) FROM testData WHERE a = 0 GROUP BY 1; |
| 27 | + |
| 28 | +-- Aggregate with complex GroupBy expressions. |
| 29 | +SELECT udf(a + b), udf(COUNT(b)) FROM testData GROUP BY a + b; |
| 30 | +SELECT udf(a + 2), udf(COUNT(b)) FROM testData GROUP BY a + 1; |
| 31 | + |
| 32 | +-- [SPARK-28445] Inconsistency between Scala and Python/Panda udfs when groupby with udf() is used |
| 33 | +-- The following query will make Scala UDF work, but Python and Pandas udfs will fail with an AnalysisException. |
| 34 | +-- The query should be added after SPARK-28445. |
| 35 | +-- SELECT udf(a + 1), udf(COUNT(b)) FROM testData GROUP BY udf(a + 1); |
| 36 | + |
| 37 | +-- Aggregate with nulls. |
| 38 | +SELECT SKEWNESS(udf(a)), udf(KURTOSIS(a)), udf(MIN(a)), MAX(udf(a)), udf(AVG(udf(a))), udf(VARIANCE(a)), STDDEV(udf(a)), udf(SUM(a)), udf(COUNT(a)) |
| 39 | +FROM testData; |
| 40 | + |
| 41 | +-- Aggregate with foldable input and multiple distinct groups. |
| 42 | +SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a; |
| 43 | + |
| 44 | +-- Aliases in SELECT could be used in GROUP BY |
| 45 | +SELECT a AS k, COUNT(udf(b)) FROM testData GROUP BY k; |
| 46 | +SELECT a AS k, udf(COUNT(b)) FROM testData GROUP BY k HAVING k > 1; |
| 47 | + |
| 48 | +-- Aggregate functions cannot be used in GROUP BY |
| 49 | +SELECT udf(COUNT(b)) AS k FROM testData GROUP BY k; |
| 50 | + |
| 51 | +-- Test data. |
| 52 | +CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES |
| 53 | +(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v); |
| 54 | +SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY a; |
| 55 | + |
| 56 | +-- turn off group by aliases |
| 57 | +set spark.sql.groupByAliases=false; |
| 58 | + |
| 59 | +-- Check analysis exceptions |
| 60 | +SELECT a AS k, udf(COUNT(udf(b))) FROM testData GROUP BY k; |
| 61 | + |
| 62 | +-- Aggregate with empty input and non-empty GroupBy expressions. |
| 63 | +SELECT a, COUNT(udf(1)) FROM testData WHERE false GROUP BY a; |
| 64 | + |
| 65 | +-- Aggregate with empty input and empty GroupBy expressions. |
| 66 | +SELECT udf(COUNT(1)) FROM testData WHERE false; |
| 67 | +SELECT 1 FROM (SELECT udf(COUNT(1)) FROM testData WHERE false) t; |
| 68 | + |
| 69 | +-- Aggregate with empty GroupBy expressions and filter on top |
| 70 | +SELECT 1 from ( |
| 71 | + SELECT 1 AS z, |
| 72 | + udf(MIN(a.x)) |
| 73 | + FROM (select 1 as x) a |
| 74 | + WHERE false |
| 75 | +) b |
| 76 | +where b.z != b.z; |
| 77 | + |
| 78 | +-- SPARK-24369 multiple distinct aggregations having the same argument set |
| 79 | +SELECT corr(DISTINCT x, y), udf(corr(DISTINCT y, x)), count(*) |
| 80 | + FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y); |
| 81 | + |
| 82 | +-- SPARK-25708 HAVING without GROUP BY means global aggregate |
| 83 | +SELECT udf(1) FROM range(10) HAVING true; |
| 84 | + |
| 85 | +SELECT udf(udf(1)) FROM range(10) HAVING MAX(id) > 0; |
| 86 | + |
| 87 | +SELECT udf(id) FROM range(10) HAVING id > 0; |
| 88 | + |
| 89 | +-- Test data |
| 90 | +CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES |
| 91 | + (1, true), (1, false), |
| 92 | + (2, true), |
| 93 | + (3, false), (3, null), |
| 94 | + (4, null), (4, null), |
| 95 | + (5, null), (5, true), (5, false) AS test_agg(k, v); |
| 96 | + |
| 97 | +-- empty table |
| 98 | +SELECT udf(every(v)), udf(some(v)), any(v) FROM test_agg WHERE 1 = 0; |
| 99 | + |
| 100 | +-- all null values |
| 101 | +SELECT udf(every(udf(v))), some(v), any(v) FROM test_agg WHERE k = 4; |
| 102 | + |
| 103 | +-- aggregates are null Filtering |
| 104 | +SELECT every(v), udf(some(v)), any(v) FROM test_agg WHERE k = 5; |
| 105 | + |
| 106 | +-- group by |
| 107 | +SELECT k, every(v), udf(some(v)), any(v) FROM test_agg GROUP BY k; |
| 108 | + |
| 109 | +-- having |
| 110 | +SELECT udf(k), every(v) FROM test_agg GROUP BY k HAVING every(v) = false; |
| 111 | +SELECT k, udf(every(v)) FROM test_agg GROUP BY k HAVING every(v) IS NULL; |
| 112 | + |
| 113 | +-- basic subquery path to make sure rewrite happens in both parent and child plans. |
| 114 | +SELECT k, |
| 115 | + udf(Every(v)) AS every |
| 116 | +FROM test_agg |
| 117 | +WHERE k = 2 |
| 118 | + AND v IN (SELECT Any(v) |
| 119 | + FROM test_agg |
| 120 | + WHERE k = 1) |
| 121 | +GROUP BY k; |
| 122 | + |
| 123 | +-- basic subquery path to make sure rewrite happens in both parent and child plans. |
| 124 | +SELECT udf(udf(k)), |
| 125 | + Every(v) AS every |
| 126 | +FROM test_agg |
| 127 | +WHERE k = 2 |
| 128 | + AND v IN (SELECT Every(v) |
| 129 | + FROM test_agg |
| 130 | + WHERE k = 1) |
| 131 | +GROUP BY k; |
| 132 | + |
| 133 | +-- input type checking Int |
| 134 | +SELECT every(udf(1)); |
| 135 | + |
| 136 | +-- input type checking Short |
| 137 | +SELECT some(udf(1S)); |
| 138 | + |
| 139 | +-- input type checking Long |
| 140 | +SELECT any(udf(1L)); |
| 141 | + |
| 142 | +-- input type checking String |
| 143 | +SELECT udf(every("true")); |
| 144 | + |
| 145 | +-- every/some/any aggregates are supported as windows expression. |
| 146 | +SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; |
| 147 | +SELECT k, udf(udf(v)), some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; |
| 148 | +SELECT udf(udf(k)), v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; |
| 149 | + |
| 150 | +-- Having referencing aggregate expressions is ok. |
| 151 | +SELECT udf(count(*)) FROM test_agg HAVING count(*) > 1L; |
| 152 | +SELECT k, udf(max(v)) FROM test_agg GROUP BY k HAVING max(v) = true; |
| 153 | + |
| 154 | +-- Aggrgate expressions can be referenced through an alias |
| 155 | +SELECT * FROM (SELECT udf(COUNT(*)) AS cnt FROM test_agg) WHERE cnt > 1L; |
| 156 | + |
| 157 | +-- Error when aggregate expressions are in where clause directly |
| 158 | +SELECT udf(count(*)) FROM test_agg WHERE count(*) > 1L; |
| 159 | +SELECT udf(count(*)) FROM test_agg WHERE count(*) + 1L > 1L; |
| 160 | +SELECT udf(count(*)) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1; |
0 commit comments