-
Notifications
You must be signed in to change notification settings - Fork 4k
Closed
Labels
Component: C++Critical FixBugfixes for security vulnerabilities, crashes, or invalid data.Bugfixes for security vulnerabilities, crashes, or invalid data.Priority: CriticalType: bug
Milestone
Description
Describe the bug, including details regarding any error messages, version, and platform.
When debugging the test in #43046, I accidentally found that the offsets within the row table were overflowing (which is much worse than #43046 itself). The suspected code is:
arrow/cpp/src/arrow/compute/row/row_internal.cc
Lines 328 to 329 in 0c4d6c7
| uint32_t total_length = to_offsets[num_rows_]; | |
| uint32_t total_length_to_append = 0; |
Here is a complete test that exposes this issue (note you'll need #43065 to really replicate this issue, otherwise the test crashes first):
// Compare columns to rows of row ids larger than 2^31 within a row table.
// Certain AVX2 instructions may behave unexpectedly causing troubles like GH-43046.
TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsMany)) {
if constexpr (sizeof(void*) == 4) {
GTEST_SKIP() << "Test only works on 64-bit platforms";
}
// The idea of this case is to create a row table containing one fixed length column and
// one var length column (so the row is hence var length and has offset buffer), by
// appending the same small batch of n rows repeatedly until it has more than 2^31 rows.
// Then compare the last n rows of the row table with the batch.
constexpr int64_t num_rows_batch = std::numeric_limits<uint16_t>::max();
constexpr int64_t num_rows_row_table =
(std::numeric_limits<int32_t>::max() + 1ll) / num_rows_batch * num_rows_batch +
num_rows_batch;
MemoryPool* pool = default_memory_pool();
// The left side columns with num_rows_batch rows.
std::vector<KeyColumnArray> columns_left;
ExecBatch batch_left;
{
std::vector<Datum> values;
// A fixed length array containing random values.
ASSERT_OK_AND_ASSIGN(auto value_fixed_length,
::arrow::gen::Random(uint32())->Generate(num_rows_batch));
values.push_back(std::move(value_fixed_length));
// A var length array containing small var length values ("X").
ASSERT_OK_AND_ASSIGN(auto value_var_length,
::arrow::gen::Constant(std::make_shared<BinaryScalar>("X"))
->Generate(num_rows_batch));
values.push_back(std::move(value_var_length));
batch_left = ExecBatch(std::move(values), num_rows_batch);
ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left));
}
// The right side row table with num_rows_row_table rows.
RowTableImpl row_table_right;
{
// Encode the row table with the left columns repeatedly.
std::vector<KeyColumnMetadata> column_metadatas;
ASSERT_OK(ColumnMetadatasFromExecBatch(batch_left, &column_metadatas));
RowTableMetadata table_metadata;
table_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t),
sizeof(uint64_t));
ASSERT_OK(row_table_right.Init(pool, table_metadata));
RowTableImpl row_table_batch;
ASSERT_OK(row_table_batch.Init(pool, table_metadata));
std::vector<uint16_t> row_ids(num_rows_batch);
std::iota(row_ids.begin(), row_ids.end(), 0);
RowTableEncoder row_encoder;
row_encoder.Init(column_metadatas, sizeof(uint64_t), sizeof(uint64_t));
row_encoder.PrepareEncodeSelected(0, num_rows_batch, columns_left);
ASSERT_OK(row_encoder.EncodeSelected(
&row_table_batch, static_cast<uint32_t>(num_rows_batch), row_ids.data()));
for (int i = 0; i < num_rows_row_table / num_rows_batch; ++i) {
ASSERT_OK(row_table_right.AppendSelectionFrom(row_table_batch, num_rows_batch,
/*source_row_ids=*/NULLPTR));
}
// The row table must contain an offset buffer.
ASSERT_NE(row_table_right.offsets(), NULLPTR);
ASSERT_EQ(row_table_right.length(), num_rows_row_table);
}
// The rows to compare: all rows in the batch to the last num_rows_batch rows of the
// row table.
std::vector<uint32_t> row_ids_to_compare(num_rows_batch);
std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(),
num_rows_row_table - num_rows_batch);
TempVectorStack stack;
ASSERT_OK(
stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows_batch)));
LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack};
{
// No selection, output no match row ids.
uint32_t num_rows_no_match;
std::vector<uint16_t> row_ids_out(num_rows_batch);
KeyCompare::CompareColumnsToRows(num_rows_batch, /*sel_left_maybe_null=*/NULLPTR,
row_ids_to_compare.data(), &ctx, &num_rows_no_match,
row_ids_out.data(), columns_left, row_table_right,
/*are_cols_in_encoding_order=*/true,
/*out_match_bitvector_maybe_null=*/NULLPTR);
ASSERT_EQ(num_rows_no_match, 0);
}
{
// No selection, output match bit vector.
std::vector<uint8_t> match_bitvector(BytesForBits(num_rows_batch));
KeyCompare::CompareColumnsToRows(
num_rows_batch, /*sel_left_maybe_null=*/NULLPTR, row_ids_to_compare.data(), &ctx,
/*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left,
row_table_right,
/*are_cols_in_encoding_order=*/true, match_bitvector.data());
ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, num_rows_batch),
num_rows_batch);
}
std::vector<uint16_t> selection_left(num_rows_batch);
std::iota(selection_left.begin(), selection_left.end(), 0);
{
// With selection, output no match row ids.
uint32_t num_rows_no_match;
std::vector<uint16_t> row_ids_out(num_rows_batch);
KeyCompare::CompareColumnsToRows(num_rows_batch, selection_left.data(),
row_ids_to_compare.data(), &ctx, &num_rows_no_match,
row_ids_out.data(), columns_left, row_table_right,
/*are_cols_in_encoding_order=*/true,
/*out_match_bitvector_maybe_null=*/NULLPTR);
ASSERT_EQ(num_rows_no_match, 0);
}
{
// With selection, output match bit vector.
std::vector<uint8_t> match_bitvector(BytesForBits(num_rows_batch));
KeyCompare::CompareColumnsToRows(
num_rows_batch, selection_left.data(), row_ids_to_compare.data(), &ctx,
/*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left,
row_table_right,
/*are_cols_in_encoding_order=*/true, match_bitvector.data());
ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, num_rows_batch),
num_rows_batch);
}
}Component(s)
C++
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
Component: C++Critical FixBugfixes for security vulnerabilities, crashes, or invalid data.Bugfixes for security vulnerabilities, crashes, or invalid data.Priority: CriticalType: bug