Describe the bug
When concatenating lists of dictionary-encoded arrays, the dictionary of the resulting array contains duplicate values.
To Reproduce
#[cfg(test)]
mod tests {
    use arrow_array::builder::{GenericListBuilder, StringDictionaryBuilder};
    use arrow_array::cast::AsArray;
    use arrow_array::types::Int32Type;
    use arrow_array::{ArrayRef, StringArray};
    use std::hash::Hash;
    use std::sync::Arc;

    /// Concatenates three single-row lists of dictionary-encoded strings
    /// (`["a"]`, `["a"]`, `["b"]`) and asserts that the dictionary backing the
    /// concatenated list's child array holds each value at most once.
    #[test]
    fn it_works() {
        let scalars = vec![
            create_list_of_dict(vec![Some("a")]),
            create_list_of_dict(vec![Some("a")]),
            create_list_of_dict(vec![Some("b")]),
        ];
        let arrays = scalars.iter().map(|a| a.as_ref()).collect::<Vec<_>>();
        let concat_res = arrow::compute::concat(arrays.as_slice()).unwrap();

        let list = concat_res.as_list::<i32>();
        let dict = list
            .values()
            .as_dictionary::<Int32Type>()
            .downcast_dict::<StringArray>()
            .unwrap();
        println!("{:?}", dict);

        let values = dict.values().iter().collect::<Vec<_>>();

        // `Vec::dedup` only collapses *consecutive* equal elements, so it would
        // miss a dictionary such as ["a", "b", "a"]. Tracking the values already
        // seen in a set detects duplicates regardless of their position.
        let mut seen = std::collections::HashSet::with_capacity(values.len());
        let duplicates = values
            .iter()
            .filter(|value| !seen.insert(**value))
            .collect::<Vec<_>>();
        assert!(
            duplicates.is_empty(),
            "There are duplicates in the value list: {:?} (dictionary values: {:?})",
            duplicates,
            values
        );
    }

    /// Builds a list array with exactly one row whose elements are `items`,
    /// stored as a dictionary-encoded string child array with `Int32` keys.
    fn create_list_of_dict(items: Vec<Option<&'static str>>) -> ArrayRef {
        let mut builder =
            GenericListBuilder::<i32, _>::new(StringDictionaryBuilder::<Int32Type>::new());
        for item in items {
            builder.values().append_option(item);
        }
        // Close the single list row; `true` marks the row as valid (non-null).
        builder.append(true);
        Arc::new(builder.finish())
    }
}
Expected behavior
When concatenating lists of dictionary-encoded arrays, `concat` should merge the dictionaries rather than blindly concatenating their values.
Additional context
This happened to me when creating an aggregate expression in DataFusion, which calls `ScalarValue::iter_to_array(results)`, which in turn uses `concat` underneath.
According to the spec duplicate values are valid:
Note that a dictionary is permitted to contain duplicate values or nulls:
From Arrow spec - Dictionary-encoded Layout