-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Closed
Labels
Description
Describe the bug
The size of RLE encoded data is routinely estimated as
RleEncoder::min_buffer_size(bit_width)
+ RleEncoder::max_buffer_size(bit_width, self.indices.len())
Where RleEncoder::min_buffer_size is defined as
let max_bit_packed_run_size = 1 + bit_util::ceil(
(MAX_VALUES_PER_BIT_PACKED_RUN * bit_width as usize) as i64,
8,
);
let max_rle_run_size =
bit_util::MAX_VLQ_BYTE_LEN + bit_util::ceil(bit_width as i64, 8) as usize;
std::cmp::max(max_bit_packed_run_size as usize, max_rle_run_size)
In practice this will almost always be 64 * bit_width.
let bytes_per_run = bit_width;
let num_runs = bit_util::ceil(num_values as i64, 8) as usize;
let bit_packed_max_size = num_runs + num_runs * bytes_per_run as usize;
let min_rle_run_size = 1 + bit_util::ceil(bit_width as i64, 8) as usize;
let rle_max_size =
bit_util::ceil(num_values as i64, 8) as usize * min_rle_run_size;
std::cmp::max(bit_packed_max_size, rle_max_size) as usize
To Reproduce
It is unclear why min_buffer_size is included in the size estimation at all
Expected behavior
A more accurate size estimation of written RLE encoded data
Additional context
Reactions are currently unavailable