Skip to content

Commit f6525f5

Browse files
committed
Fix bug in Variant offsets cache
1 parent 827aaff commit f6525f5

File tree

3 files changed

+107
-1
lines changed

3 files changed

+107
-1
lines changed

src/DataTypes/Serializations/SerializationVariant.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -729,7 +729,11 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams(
729729
}
730730
}
731731

732-
addColumnWithNumReadRowsToSubstreamsCache(cache, settings.path, col.getOffsetsPtr(), col.getOffsetsPtr()->size() - prev_size);
732+
/// Don't cache offsets calculated from a temporary column (when insert_only_rows_in_current_range_from_substreams_cache is set).
733+
/// Such offsets are relative to the temporary column's variant sizes (starting from 0) and would be incorrect
734+
/// for other readers that have accumulated data from previous read ranges.
735+
if (!settings.insert_only_rows_in_current_range_from_substreams_cache)
736+
addColumnWithNumReadRowsToSubstreamsCache(cache, settings.path, col.getOffsetsPtr(), col.getOffsetsPtr()->size() - prev_size);
733737
}
734738
settings.path.pop_back();
735739

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
combined_and_subobject_vrow1
2+
0 \N {"b":{"c":0}}
3+
1 \N {"b":{"c":0}}
4+
2 \N {"b":{"c":0}}
5+
3 \N {"b":{"c":0}}
6+
4 \N {"b":{"c":0}}
7+
5 {"b":{"c":5}} {"b":{"c":5}}
8+
6 {"b":{"c":6}} {"b":{"c":6}}
9+
7 {"b":{"c":7}} {"b":{"c":7}}
10+
8 {"b":{"c":8}} {"b":{"c":8}}
11+
9 {"b":{"c":9}} {"b":{"c":9}}
12+
10 {"b":{"c":0,"d":10,"e":"str_10"}} {"b":{"c":0,"d":10,"e":"str_10"}}
13+
11 {"b":{"c":0,"d":11,"e":"str_11"}} {"b":{"c":0,"d":11,"e":"str_11"}}
14+
12 {"b":{"c":0,"d":12,"e":"str_12"}} {"b":{"c":0,"d":12,"e":"str_12"}}
15+
13 {"b":{"c":0,"d":13,"e":"str_13"}} {"b":{"c":0,"d":13,"e":"str_13"}}
16+
14 {"b":{"c":0,"d":14,"e":"str_14"}} {"b":{"c":0,"d":14,"e":"str_14"}}
17+
20 {"b":{"c":20,"d":20,"e":"str_20"}} {"b":{"c":20,"d":20,"e":"str_20"}}
18+
21 {"b":{"c":21,"d":21,"e":"str_21"}} {"b":{"c":21,"d":21,"e":"str_21"}}
19+
22 {"b":{"c":22,"d":22,"e":"str_22"}} {"b":{"c":22,"d":22,"e":"str_22"}}
20+
23 {"b":{"c":23,"d":23,"e":"str_23"}} {"b":{"c":23,"d":23,"e":"str_23"}}
21+
24 {"b":{"c":24,"d":24,"e":"str_24"}} {"b":{"c":24,"d":24,"e":"str_24"}}
22+
30 {"b":{"c":30,"d":[0],"e":"str_30"}} {"b":{"c":30,"d":[0],"e":"str_30"}}
23+
31 {"b":{"c":31,"d":[0,1],"e":"str_31"}} {"b":{"c":31,"d":[0,1],"e":"str_31"}}
24+
32 {"b":{"c":32,"d":[0,1,2],"e":"str_32"}} {"b":{"c":32,"d":[0,1,2],"e":"str_32"}}
25+
33 {"b":{"c":33,"d":[0,1,2,3],"e":"str_33"}} {"b":{"c":33,"d":[0,1,2,3],"e":"str_33"}}
26+
34 {"b":{"c":34,"d":[0,1,2,3,4],"e":"str_34"}} {"b":{"c":34,"d":[0,1,2,3,4],"e":"str_34"}}
27+
combined_and_subobject_vrow0
28+
0 \N {"b":{"c":0}}
29+
1 \N {"b":{"c":0}}
30+
2 \N {"b":{"c":0}}
31+
3 \N {"b":{"c":0}}
32+
4 \N {"b":{"c":0}}
33+
5 {"b":{"c":5}} {"b":{"c":5}}
34+
6 {"b":{"c":6}} {"b":{"c":6}}
35+
7 {"b":{"c":7}} {"b":{"c":7}}
36+
8 {"b":{"c":8}} {"b":{"c":8}}
37+
9 {"b":{"c":9}} {"b":{"c":9}}
38+
10 {"b":{"c":0,"d":10,"e":"str_10"}} {"b":{"c":0,"d":10,"e":"str_10"}}
39+
11 {"b":{"c":0,"d":11,"e":"str_11"}} {"b":{"c":0,"d":11,"e":"str_11"}}
40+
12 {"b":{"c":0,"d":12,"e":"str_12"}} {"b":{"c":0,"d":12,"e":"str_12"}}
41+
13 {"b":{"c":0,"d":13,"e":"str_13"}} {"b":{"c":0,"d":13,"e":"str_13"}}
42+
14 {"b":{"c":0,"d":14,"e":"str_14"}} {"b":{"c":0,"d":14,"e":"str_14"}}
43+
20 {"b":{"c":20,"d":20,"e":"str_20"}} {"b":{"c":20,"d":20,"e":"str_20"}}
44+
21 {"b":{"c":21,"d":21,"e":"str_21"}} {"b":{"c":21,"d":21,"e":"str_21"}}
45+
22 {"b":{"c":22,"d":22,"e":"str_22"}} {"b":{"c":22,"d":22,"e":"str_22"}}
46+
23 {"b":{"c":23,"d":23,"e":"str_23"}} {"b":{"c":23,"d":23,"e":"str_23"}}
47+
24 {"b":{"c":24,"d":24,"e":"str_24"}} {"b":{"c":24,"d":24,"e":"str_24"}}
48+
30 {"b":{"c":30,"d":[0],"e":"str_30"}} {"b":{"c":30,"d":[0],"e":"str_30"}}
49+
31 {"b":{"c":31,"d":[0,1],"e":"str_31"}} {"b":{"c":31,"d":[0,1],"e":"str_31"}}
50+
32 {"b":{"c":32,"d":[0,1,2],"e":"str_32"}} {"b":{"c":32,"d":[0,1,2],"e":"str_32"}}
51+
33 {"b":{"c":33,"d":[0,1,2,3],"e":"str_33"}} {"b":{"c":33,"d":[0,1,2,3],"e":"str_33"}}
52+
34 {"b":{"c":34,"d":[0,1,2,3,4],"e":"str_34"}} {"b":{"c":34,"d":[0,1,2,3,4],"e":"str_34"}}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
-- Tags: no-fasttest, no-s3-storage, no-azure-blob-storage
2+
-- Tag: no-azure-blob-storage - too slow
3+
-- Tag: no-s3-storage - too slow
4+
-- Regression test: SerializationObjectCombinedPath cached zero-based variant offsets
5+
-- from temporary columns, poisoning the cache for SerializationSubObject which has
6+
-- accumulated data from previous read ranges. The bug is exposed when
7+
-- read_in_order_use_virtual_row=1 disables MergingSortedTransform batch passthrough,
8+
-- forcing row-by-row insertFrom that uses the corrupted offsets.
9+
10+
SET enable_json_type = 1;
11+
SET allow_experimental_variant_type = 1;
12+
SET use_variant_as_common_type = 1;
13+
SET session_timezone = 'UTC';
14+
15+
DROP TABLE IF EXISTS test_json_cache;
16+
CREATE TABLE test_json_cache (id UInt64, json JSON(max_dynamic_paths=1000, a.b.c UInt32))
17+
ENGINE = MergeTree ORDER BY id
18+
SETTINGS min_rows_for_wide_part=1, min_bytes_for_wide_part=1,
19+
object_serialization_version='v2',
20+
vertical_merge_algorithm_min_rows_to_activate=1,
21+
vertical_merge_algorithm_min_columns_to_activate=1;
22+
23+
-- Create 4 parts with different path combinations to produce a merged part
24+
-- with multiple marks/granules and mixed typed+dynamic paths.
25+
SYSTEM STOP MERGES test_json_cache;
26+
27+
INSERT INTO test_json_cache SELECT number, '{}' FROM numbers(5);
28+
INSERT INTO test_json_cache SELECT number, toJSONString(map('a.b.c', number)) FROM numbers(5, 5);
29+
INSERT INTO test_json_cache SELECT number, toJSONString(map('a.b.d', number::UInt32, 'a.b.e', 'str_' || toString(number))) FROM numbers(10, 5);
30+
INSERT INTO test_json_cache SELECT number, toJSONString(map('a.b.c', number, 'a.b.d', number::UInt32, 'a.b.e', 'str_' || toString(number))) FROM numbers(20, 5);
31+
32+
-- Merge all 4 parts into one, producing a single part with multiple granules.
33+
SYSTEM START MERGES test_json_cache;
34+
OPTIMIZE TABLE test_json_cache FINAL SETTINGS mutations_sync=1;
35+
SYSTEM STOP MERGES test_json_cache;
36+
37+
-- Add a second part with Array(UInt32) for a.b.d to test variant type diversity.
38+
INSERT INTO test_json_cache SELECT number, toJSONString(map('a.b.c', number, 'a.b.d', range(number % 5 + 1)::Array(UInt32), 'a.b.e', 'str_' || toString(number), 'd.a', number::UInt32, 'd.c', toDate(number))) FROM numbers(30, 5);
39+
40+
-- Now we have 2 parts. Reading with ORDER BY id triggers MergingSortedTransform.
41+
-- With read_in_order_use_virtual_row=1, batch passthrough is disabled and
42+
-- row-by-row insertFrom exposes the corrupted variant offsets in ^a.
43+
44+
-- Both queries must produce identical results.
45+
SELECT 'combined_and_subobject_vrow1';
46+
SELECT id, json.@a, json.^a FROM test_json_cache ORDER BY id SETTINGS read_in_order_use_virtual_row=1;
47+
SELECT 'combined_and_subobject_vrow0';
48+
SELECT id, json.@a, json.^a FROM test_json_cache ORDER BY id SETTINGS read_in_order_use_virtual_row=0;
49+
50+
DROP TABLE test_json_cache;

0 commit comments

Comments
 (0)