Skip to content

Commit 1672cc6

Browse files
Backport #79008 to 25.3: Fix hashing of NULLs, attempt 3
1 parent 3c97c1d commit 1672cc6

File tree

5 files changed

+289
-2
lines changed

5 files changed

+289
-2
lines changed

docs/en/sql-reference/functions/hash-functions.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,19 @@ Hash functions can be used for the deterministic pseudo-random shuffling of elem
1212

1313
Simhash is a hash function, which returns close hash values for close (similar) arguments.
1414

15+
Most hash functions accept any number of arguments of any types.
16+
17+
:::note
18+
Hash of NULL is NULL. To get a non-NULL hash of a Nullable column, wrap it in a tuple:
19+
```sql
20+
SELECT cityHash64(tuple(NULL))
21+
```
22+
:::
23+
24+
:::note
25+
To calculate hash of the whole contents of a table, use `sum(cityHash64(tuple(*)))` (or other hash function). `tuple` ensures that rows with NULL values are not skipped. `sum` ensures that the order of rows doesn't matter.
26+
:::
27+
1528
## halfMD5 {#halfmd5}
1629

1730
[Interprets](/sql-reference/functions/type-conversion-functions#reinterpretasstring) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order.
@@ -1962,7 +1975,7 @@ Calculates Keccak-256 hash string and returns the resulting set of bytes as [Fix
19621975
keccak256('s')
19631976
```
19641977

1965-
This cryptographic hash-function is used a lot in [EVM-based blockchains](https://ethereum.github.io/yellowpaper/paper.pdf).
1978+
This cryptographic hash-function is used a lot in [EVM-based blockchains](https://ethereum.github.io/yellowpaper/paper.pdf).
19661979

19671980
**Arguments**
19681981

src/Functions/FunctionsHashing.h

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,15 @@
3333
#include <DataTypes/DataTypeEnum.h>
3434
#include <DataTypes/DataTypeTuple.h>
3535
#include <DataTypes/DataTypeMap.h>
36+
#include <DataTypes/DataTypeNullable.h>
3637
#include <Columns/ColumnsNumber.h>
3738
#include <Columns/ColumnString.h>
3839
#include <Columns/ColumnConst.h>
3940
#include <Columns/ColumnFixedString.h>
4041
#include <Columns/ColumnArray.h>
4142
#include <Columns/ColumnTuple.h>
4243
#include <Columns/ColumnMap.h>
44+
#include <Columns/ColumnNullable.h>
4345
#include <Functions/IFunction.h>
4446
#include <Functions/FunctionHelpers.h>
4547
#include <Functions/PerformanceAdaptors.h>
@@ -850,6 +852,10 @@ class FunctionAnyHash : public IFunction
850852
{
851853
public:
852854
static constexpr auto name = Impl::name;
855+
static constexpr UInt128 NULL_HASH = UInt128({0xc58ad2da03d9a871ul, 0x5715f196cbea7a40ul});
856+
857+
/// Keep default implementation of useDefaultImplementationForNulls for compatibility.
858+
/// E.g. someHash(NULL) is NULL, but someHash(tuple(NULL)) is not NULL.
853859

854860
private:
855861
using ToType = typename Impl::ReturnType;
@@ -1161,6 +1167,18 @@ class FunctionAnyHash : public IFunction
11611167
key = Impl::getKey(key_cols, i);
11621168
ColumnArray::Offset next_offset = offsets[i];
11631169

1170+
/// There are two bugs here, affecting hashes of empty arrays:
1171+
/// 1. If ToType is UInt128, we produce a 32-bit hash,
1172+
/// 2. `hash` doesn't depend on key.
1173+
///
1174+
/// SELECT reinterpret(sipHash128Keyed((number, number), []), 'UInt128') FROM numbers(2)
1175+
///
1176+
/// ┌─reinterpret(⋯ 'UInt128')─┐
1177+
/// 1. │ 4249604106 │
1178+
/// 2. │ 4249604106 │
1179+
/// └──────────────────────────┘
1180+
///
1181+
/// There's no way to fix this without breaking compatibility.
11641182
ToType hash;
11651183
if constexpr (std::is_same_v<ToType, UInt64>)
11661184
hash = IntHash64Impl::apply(next_offset - current_offset);
@@ -1189,6 +1207,74 @@ class FunctionAnyHash : public IFunction
11891207
column->getName(), getName());
11901208
}
11911209

1210+
template <bool first>
1211+
void executeNothing(const KeyColumnsType &, const IColumn *, typename ColumnVector<ToType>::Container & vec_to) const
1212+
{
1213+
/// This value shouldn't affect anything, it should only appear inside null Nullable or
1214+
/// empty Array, where the caller will ignore this and assign their own hash value.
1215+
/// Fill it with zeroes just in case, to avoid leaking memory contents if something's broken.
1216+
vec_to.assign(vec_to.size(), ToType(0));
1217+
}
1218+
1219+
template <bool first>
1220+
void executeNullable(const KeyColumnsType & key_cols, const IDataType * from_type, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to) const
1221+
{
1222+
if (const auto * col_from = checkAndGetColumn<ColumnNullable>(column))
1223+
{
1224+
const auto * nested_type = assert_cast<const DataTypeNullable &>(*from_type).getNestedType().get();
1225+
const auto & nested_col = col_from->getNestedColumn();
1226+
1227+
KeyType key {};
1228+
ToType null_hash;
1229+
if constexpr (Keyed)
1230+
{
1231+
/// Make the hash depend on key.
1232+
key = Impl::getKey(key_cols, 0);
1233+
null_hash = combineHashes(key, static_cast<ToType>(NULL_HASH), 0);
1234+
}
1235+
else
1236+
{
1237+
null_hash = static_cast<ToType>(NULL_HASH);
1238+
}
1239+
1240+
typename ColumnVector<ToType>::Container original_vec_to;
1241+
if (!first)
1242+
original_vec_to.assign(vec_to);
1243+
1244+
/// Make sure non-null values are hashed exactly as if the type were non-Nullable.
1245+
bool is_first = first;
1246+
executeForArgument(key_cols, nested_type, &nested_col, vec_to, is_first);
1247+
1248+
for (size_t i = 0; i < vec_to.size(); ++i)
1249+
{
1250+
if (!col_from->isNullAt(i))
1251+
continue;
1252+
1253+
if constexpr (Keyed)
1254+
{
1255+
if (!key_cols.is_const && i != 0)
1256+
{
1257+
key = Impl::getKey(key_cols, i);
1258+
null_hash = combineHashes(key, static_cast<ToType>(NULL_HASH), 0);
1259+
}
1260+
}
1261+
1262+
if constexpr (first)
1263+
vec_to[i] = null_hash;
1264+
else
1265+
vec_to[i] = combineHashes(key, original_vec_to[i], null_hash);
1266+
}
1267+
}
1268+
else if (const ColumnConst * col_from_const = checkAndGetColumnConst<ColumnNullable>(column))
1269+
{
1270+
ColumnPtr full_column = col_from_const->convertToFullColumn();
1271+
executeNullable<first>(key_cols, from_type, full_column.get(), vec_to);
1272+
}
1273+
else
1274+
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}",
1275+
column->getName(), getName());
1276+
}
1277+
11921278
template <bool first>
11931279
void executeAny(const KeyColumnsType & key_cols, const IDataType * from_type, const IColumn * icolumn, typename ColumnVector<ToType>::Container & vec_to) const
11941280
{
@@ -1232,6 +1318,8 @@ class FunctionAnyHash : public IFunction
12321318
else if (which.isString()) executeString<first>(key_cols, icolumn, vec_to);
12331319
else if (which.isFixedString()) executeString<first>(key_cols, icolumn, vec_to);
12341320
else if (which.isArray()) executeArray<first>(key_cols, from_type, icolumn, vec_to);
1321+
else if (which.isNothing()) executeNothing<first>(key_cols, icolumn, vec_to);
1322+
else if (which.isNullable()) executeNullable<first>(key_cols, from_type, icolumn, vec_to);
12351323
else executeGeneric<first>(key_cols, icolumn, vec_to);
12361324
}
12371325

tests/queries/0_stateless/02534_keyed_siphash.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ SELECT 'Check asan bug';
336336
SELECT sipHash128((toUInt64(9223372036854775806), 1)) = sipHash128(1) GROUP BY sipHash128(1::UInt8), toUInt64(9223372036854775806);
337337

338338
SELECT 'Check bug found fuzzing';
339-
SELECT [(255, 1048575)], sipHash128ReferenceKeyed((toUInt64(2147483646), toUInt64(9223372036854775807)), ([(NULL, 100), (NULL, NULL), (1024, 10)], toUInt64(2), toUInt64(1024)), ''), hex(sipHash128ReferenceKeyed((-9223372036854775807, 1.), '-1', NULL)), ('', toUInt64(65535), [(9223372036854775807, 9223372036854775806)], toUInt64(65536)), arrayJoin((NULL, 65537, 255), [(NULL, NULL)]) GROUP BY tupleElement((NULL, NULL, NULL, -1), toUInt64(2), 2) = NULL; -- { serverError NOT_IMPLEMENTED }
339+
SELECT [(255, 1048575)], sipHash128ReferenceKeyed((toUInt64(2147483646), toUInt64(9223372036854775807)), ([(NULL, 100), (NULL, NULL), (1024, 10)], toUInt64(2), toUInt64(1024)), ''), hex(sipHash128ReferenceKeyed((-9223372036854775807, 1.), '-1', NULL)), ('', toUInt64(65535), [(9223372036854775807, 9223372036854775806)], toUInt64(65536)), arrayJoin((NULL, 65537, 255), [(NULL, NULL)]) GROUP BY tupleElement((NULL, NULL, NULL, -1), toUInt64(2), 2) = NULL SETTINGS enable_analyzer=1; -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
340340
SELECT hex(sipHash128ReferenceKeyed((0::UInt64, 0::UInt64), ([1, 1])));
341341

342342
SELECT 'Test arrays and maps';
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
-- { echoOn }
2+
3+
select xxHash32(null);
4+
\N
5+
select xxHash64(null);
6+
\N
7+
select xxHash64([]);
8+
4761183170873013810
9+
select xxHash64([null]);
10+
7883836132540916187
11+
select xxHash64([null, null]);
12+
4782716432907020049
13+
select xxHash64([null::Nullable(Int64)]);
14+
7883836132540916187
15+
select xxHash64([null::Nullable(String)]);
16+
7883836132540916187
17+
select xxHash64(tuple());
18+
16324913028386710556
19+
select xxHash64(tuple(null));
20+
14234421406067173489
21+
select xxHash64(tuple(null, null));
22+
16006964495223566989
23+
select xxHash64(tuple(null::Nullable(Int64)));
24+
14234421406067173489
25+
select xxHash64(tuple(null::Nullable(String)));
26+
14234421406067173489
27+
select xxHash32(materialize(null));
28+
\N
29+
select xxHash64(materialize(null));
30+
\N
31+
select xxHash64(materialize([]));
32+
4761183170873013810
33+
select xxHash64(materialize([null]));
34+
7883836132540916187
35+
select xxHash64(materialize([null, null]));
36+
4782716432907020049
37+
select xxHash64(materialize([null::Nullable(Int64)]));
38+
7883836132540916187
39+
select xxHash64(materialize([null::Nullable(String)]));
40+
7883836132540916187
41+
select xxHash64(materialize(tuple()));
42+
16324913028386710556
43+
select xxHash64(materialize(tuple(null)));
44+
14234421406067173489
45+
select xxHash64(materialize(tuple(null, null)));
46+
16006964495223566989
47+
select xxHash64(materialize(tuple(null::Nullable(Int64))));
48+
14234421406067173489
49+
select xxHash64(materialize(tuple(null::Nullable(String))));
50+
14234421406067173489
51+
create table test_hash_on_null (a Array(Nullable(Int64))) engine Memory;
52+
insert into test_hash_on_null values (null) ([null, null]);
53+
select xxHash32(a) from test_hash_on_null;
54+
4249604106
55+
473961067
56+
select cityHash64([1]);
57+
2963573356302499406
58+
select cityHash64([toNullable(1)]);
59+
2963573356302499406
60+
select cityHash64('hi');
61+
17851379961049682469
62+
select cityHash64(tuple('hi'));
63+
17851379961049682469
64+
select cityHash64(tuple(toNullable('hi')));
65+
17851379961049682469
66+
select cityHash64(tuple(toLowCardinality(toNullable('hi'))));
67+
17851379961049682469
68+
select cityHash64(materialize(tuple(toLowCardinality(toNullable('hi')))));
69+
17851379961049682469
70+
create table test_mix_null (a Nullable(Int64)) engine Memory;
71+
insert into test_mix_null values (null) (toNullable(4)) (null) (toNullable(4454559));
72+
select a, xxHash32(a), xxHash32(tuple(a)) from test_mix_null;
73+
\N \N 64596081
74+
4 4160678787 4160678787
75+
\N \N 64596081
76+
4454559 443946719 443946719
77+
create table t (a Array(Tuple(x Nullable(Int64), y Map(Int64, Nullable(String)), z LowCardinality(Nullable(FixedString(16)))))) engine Memory;
78+
insert into t values ([(null, map(10, null, 20, 'meow', 30, '', 40, null), 'fs'), (42, map(), null)]), ([]), ([(null, map(), null)]), ([(null, map(1, null), null), (1, map(2, 'hi'), 3)]);
79+
select reinterpret(sipHash128(tuple(*)), 'UInt128') from t;
80+
86561178484294891618738366568552299261
81+
4249604106
82+
300414384037013415667225390679639372212
83+
224640663760774045608948914723898661943
84+
select cityHash64(tuple(*)) from t;
85+
1608053812622067875
86+
4761183170873013810
87+
17833311824296618694
88+
6925065585755071929
89+
select cityHash64(*) from t;
90+
1608053812622067875
91+
4761183170873013810
92+
17833311824296618694
93+
6925065585755071929
94+
select cityHash64(a.x) from t;
95+
16588999816657690590
96+
4761183170873013810
97+
7883836132540916187
98+
15527531030475525876
99+
select cityHash64(a.y) from t;
100+
1494474170843716293
101+
4761183170873013810
102+
10275428604011817656
103+
11263809815669465501
104+
select cityHash64(a.z) from t;
105+
5826723076831594532
106+
4761183170873013810
107+
7883836132540916187
108+
3510899528476195174
109+
--- Keyed.
110+
select sipHash64Keyed(materialize((1::UInt64, 2::UInt64)), null) from numbers(2);
111+
\N
112+
\N
113+
select sipHash64Keyed((1::UInt64, 2::UInt64), tuple(null)) from numbers(2);
114+
14480900820145544791
115+
14480900820145544791
116+
select sipHash64Keyed(materialize((1::UInt64, 2::UInt64)), tuple(null)) from numbers(2);
117+
14480900820145544791
118+
14480900820145544791
119+
select sipHash64Keyed((1::UInt64, number), tuple(null)) from numbers(3);
120+
11937733673908139404
121+
17037385873844859331
122+
14480900820145544791
123+
-- Make sure all types are allowed.
124+
select sum(ignore(cityHash64(tuple(*)))) from (select * from generateRandom() limit 100);
125+
0
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
-- { echoOn }
2+
3+
select xxHash32(null);
4+
select xxHash64(null);
5+
select xxHash64([]);
6+
select xxHash64([null]);
7+
select xxHash64([null, null]);
8+
select xxHash64([null::Nullable(Int64)]);
9+
select xxHash64([null::Nullable(String)]);
10+
select xxHash64(tuple());
11+
select xxHash64(tuple(null));
12+
select xxHash64(tuple(null, null));
13+
select xxHash64(tuple(null::Nullable(Int64)));
14+
select xxHash64(tuple(null::Nullable(String)));
15+
16+
select xxHash32(materialize(null));
17+
select xxHash64(materialize(null));
18+
select xxHash64(materialize([]));
19+
select xxHash64(materialize([null]));
20+
select xxHash64(materialize([null, null]));
21+
select xxHash64(materialize([null::Nullable(Int64)]));
22+
select xxHash64(materialize([null::Nullable(String)]));
23+
select xxHash64(materialize(tuple()));
24+
select xxHash64(materialize(tuple(null)));
25+
select xxHash64(materialize(tuple(null, null)));
26+
select xxHash64(materialize(tuple(null::Nullable(Int64))));
27+
select xxHash64(materialize(tuple(null::Nullable(String))));
28+
29+
create table test_hash_on_null (a Array(Nullable(Int64))) engine Memory;
30+
insert into test_hash_on_null values (null) ([null, null]);
31+
select xxHash32(a) from test_hash_on_null;
32+
33+
select cityHash64([1]);
34+
select cityHash64([toNullable(1)]);
35+
select cityHash64('hi');
36+
select cityHash64(tuple('hi'));
37+
select cityHash64(tuple(toNullable('hi')));
38+
select cityHash64(tuple(toLowCardinality(toNullable('hi'))));
39+
select cityHash64(materialize(tuple(toLowCardinality(toNullable('hi')))));
40+
41+
create table test_mix_null (a Nullable(Int64)) engine Memory;
42+
insert into test_mix_null values (null) (toNullable(4)) (null) (toNullable(4454559));
43+
select a, xxHash32(a), xxHash32(tuple(a)) from test_mix_null;
44+
45+
create table t (a Array(Tuple(x Nullable(Int64), y Map(Int64, Nullable(String)), z LowCardinality(Nullable(FixedString(16)))))) engine Memory;
46+
insert into t values ([(null, map(10, null, 20, 'meow', 30, '', 40, null), 'fs'), (42, map(), null)]), ([]), ([(null, map(), null)]), ([(null, map(1, null), null), (1, map(2, 'hi'), 3)]);
47+
select reinterpret(sipHash128(tuple(*)), 'UInt128') from t;
48+
select cityHash64(tuple(*)) from t;
49+
select cityHash64(*) from t;
50+
select cityHash64(a.x) from t;
51+
select cityHash64(a.y) from t;
52+
select cityHash64(a.z) from t;
53+
54+
--- Keyed.
55+
select sipHash64Keyed(materialize((1::UInt64, 2::UInt64)), null) from numbers(2);
56+
select sipHash64Keyed((1::UInt64, 2::UInt64), tuple(null)) from numbers(2);
57+
select sipHash64Keyed(materialize((1::UInt64, 2::UInt64)), tuple(null)) from numbers(2);
58+
select sipHash64Keyed((1::UInt64, number), tuple(null)) from numbers(3);
59+
60+
-- Make sure all types are allowed.
61+
select sum(ignore(cityHash64(tuple(*)))) from (select * from generateRandom() limit 100);

0 commit comments

Comments
 (0)