Add StringViewArray and BinaryViewArray (#4253)#4585
Add StringViewArray and BinaryViewArray (#4253)#4585tustvold wants to merge 3 commits intoapache:masterfrom
Conversation
|
Related mailing list discussion: https://lists.apache.org/thread/w88tpz76ox8h3rxkjl4so6rg3f1rv7wt |
arrow-data/src/view.rs
Outdated
There was a problem hiding this comment.
The "small string optimization" in rust !!! -- I had wondered what this would look like
bkietz
left a comment
There was a problem hiding this comment.
This looks good to me, I have just a few suggestions
| /// View data buffers | ||
| /// This is not stored in `MutableArrayData` because these values constant and only needed | ||
| /// at the end, when freezing [_MutableArrayData]. | ||
| view_buffers: Vec<Buffer>, |
There was a problem hiding this comment.
I'd recommend renaming this to avoid the suggestion that these buffers hold views
| /// View data buffers | |
| /// This is not stored in `MutableArrayData` because these values constant and only needed | |
| /// at the end, when freezing [_MutableArrayData]. | |
| view_buffers: Vec<Buffer>, | |
| /// Variadic data buffers referenced by views | |
| /// This is not stored in `MutableArrayData` because these values constant and only needed | |
| /// at the end, when freezing [_MutableArrayData]. | |
| variadic_character_buffers: Vec<Buffer>, |
| let lhs_b = lhs.buffers(); | ||
| let rhs_views = &rhs.buffer::<u128>(0)[rhs_start..rhs_start + len]; | ||
| let rhs_b = rhs.buffers(); | ||
|
|
||
| for (idx, (l, r)) in lhs_views.iter().zip(rhs_views).enumerate() { | ||
| // Only checking one null mask here because by the time the control flow reaches | ||
| // this point, the equality of the two masks would have already been verified. | ||
| if lhs.is_null(idx) { | ||
| continue; | ||
| } | ||
|
|
||
| let l_len = *l as u32; | ||
| let r_len = *r as u32; | ||
| if l_len != r_len { | ||
| return false; | ||
| } else if l_len <= 12 { | ||
| // Inline storage | ||
| if l != r { | ||
| return false; | ||
| } | ||
| } else { | ||
| let l_view = View::from(*l); | ||
| let r_view = View::from(*r); | ||
| let l_b = &lhs_b[(l_view.buffer_index as usize) + 1]; | ||
| let r_b = &rhs_b[(r_view.buffer_index as usize) + 1]; | ||
|
|
||
| let l_o = l_view.offset as usize; | ||
| let r_o = r_view.offset as usize; | ||
| let len = l_len as usize; | ||
| if l_b[l_o..l_o + len] != r_b[r_o..r_o + len] { | ||
| return false; | ||
| } | ||
| } | ||
| } | ||
| true |
There was a problem hiding this comment.
For equality comparison, we can compare the length and prefix of the views simultaneously:
| let lhs_b = lhs.buffers(); | |
| let rhs_views = &rhs.buffer::<u128>(0)[rhs_start..rhs_start + len]; | |
| let rhs_b = rhs.buffers(); | |
| for (idx, (l, r)) in lhs_views.iter().zip(rhs_views).enumerate() { | |
| // Only checking one null mask here because by the time the control flow reaches | |
| // this point, the equality of the two masks would have already been verified. | |
| if lhs.is_null(idx) { | |
| continue; | |
| } | |
| let l_len = *l as u32; | |
| let r_len = *r as u32; | |
| if l_len != r_len { | |
| return false; | |
| } else if l_len <= 12 { | |
| // Inline storage | |
| if l != r { | |
| return false; | |
| } | |
| } else { | |
| let l_view = View::from(*l); | |
| let r_view = View::from(*r); | |
| let l_b = &lhs_b[(l_view.buffer_index as usize) + 1]; | |
| let r_b = &rhs_b[(r_view.buffer_index as usize) + 1]; | |
| let l_o = l_view.offset as usize; | |
| let r_o = r_view.offset as usize; | |
| let len = l_len as usize; | |
| if l_b[l_o..l_o + len] != r_b[r_o..r_o + len] { | |
| return false; | |
| } | |
| } | |
| } | |
| true | |
| let lhs_b = lhs.buffers()[1..]; | |
| let rhs_views = &rhs.buffer::<u128>(0)[rhs_start..rhs_start + len]; | |
| let rhs_b = rhs.buffers()[1..]; | |
| for (idx, (l, r)) in lhs_views.iter().zip(rhs_views).enumerate() { | |
| // Only checking one null mask here because by the time the control flow reaches | |
| // this point, the equality of the two masks would have already been verified. | |
| if lhs.is_null(idx) { | |
| continue; | |
| } | |
| let l_len_prefix = *l as u64; | |
| let r_len_prefix = *r as u64; | |
| if l_len_prefix != r_len_prefix { | |
| return false; | |
| } | |
| let len = *l as u32; | |
| if len <= 12 { | |
| if (*l >> 64) as u64 != (*r >> 64) as u64 { | |
| return false; | |
| } | |
| continue; | |
| } | |
| let l_view = View::from(*l); | |
| let r_view = View::from(*r); | |
| let l_b = &lhs_b[l_view.buffer_index as usize]; | |
| let r_b = &rhs_b[r_view.buffer_index as usize]; | |
| // prefixes are already known to be equal; skip checking them | |
| let len = len as usize - 4; | |
| let l_o = l_view.offset as usize + 4; | |
| let r_o = r_view.offset as usize + 4; | |
| if l_b[l_o..l_o + len] != r_b[r_o..r_o + len] { | |
| return false; | |
| } | |
| } | |
| true |
There was a problem hiding this comment.
I'd have hoped LLVM is smart enough to work this out but I can double-check
|
See #5374 for implementation discussions |
|
|
||
| /// An array of variable length byte view arrays | ||
| pub struct GenericByteViewArray<T: ByteViewType> { | ||
| data_type: DataType, |
There was a problem hiding this comment.
Why we need this? Can we just use T::DATA_TYPE?
| let v: &[u8] = value.as_ref().as_ref(); | ||
| let length: u32 = v.len().try_into().unwrap(); | ||
| if length <= 12 { | ||
| let mut offset = [0; 16]; |
There was a problem hiding this comment.
I'd recommend renaming offset to view_buffer.
| self.len() | ||
| ); | ||
|
|
||
| assert!(i < self.views.len()); |
There was a problem hiding this comment.
This assertion is duplicate with the previous one.
| } | ||
| } else { | ||
| let l_view = View::from(*l); | ||
| let r_view = View::from(*r); |
There was a problem hiding this comment.
We can compare the prefix to short-circuit.
As described in paper:
In case of long strings, the remaining four bytes
of the header are used to store the first four characters of the string,
allowing Umbra to short-circuit some comparisons
There was a problem hiding this comment.
Oh, I just note it's commend already. ignore this.
| /// The element layout of a view buffer | ||
| /// | ||
| /// See [`DataType::Utf8View`](arrow_schema::DataType) | ||
| pub struct View { |
There was a problem hiding this comment.
do we need to use #[repr(C)] ?
| buffers: &[Buffer], | ||
| ) -> Result<(), ArrowError> { | ||
| validate_view_impl(views, buffers, |idx, b| { | ||
| std::str::from_utf8(b).map_err(|e| { |
There was a problem hiding this comment.
simdutf8 crate may be better than this.
|
Thanks for the comments @sundy-li . Is there any chance you or someone at DataBend are interested in / planning on working on this feature? |
Yes, we are working on it at databendlabs/databend#14662 In databend, our column memory model is still based on arrow2. But reading/writing is using arrow-rs, we are planing to work on it after databendlabs/databend#14662 is finished. |
|
Hi @alamb, I'm willing to work on this feature after databendlabs/databend#14662 is finished. |
Thank you @sundy-li and @ariesdevil I left a comment on #5374 #5374 (comment) about how we can potentially work on this together |
|
I'm going to close this PR as I believe others are picking this up |
Indeed -- we are tracking the work in #5374 |
Draft as not yet standardised and needs a LOT more testing
Which issue does this PR close?
Closes #.
Rationale for this change
What changes are included in this PR?
Are there any user-facing changes?