-
Notifications
You must be signed in to change notification settings - Fork 2k
Closed
Labels
bugSomething isn't workingSomething isn't working
Description
Describe the bug
Array::is_null for dictionary arrays doesn't account for nulls in the values array. This causes hard-to-debug issues.
For example, the dictionary below contains only null values. When we run a distinct count on it, the result is sometimes 1 (incorrect) and sometimes 0.
To Reproduce
use arrow::array::{ArrayRef, DictionaryArray, Int32Array, RecordBatch, StringArray};
use arrow::datatypes::{DataType, Field, Schema};
use datafusion::catalog::MemTable;
use datafusion::prelude::SessionContext;
use datafusion_common::ScalarValue;
use datafusion_execution::config::SessionConfig;
use std::sync::Arc;
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let n: usize = 5;
let num = Arc::new(Int32Array::from((0..n as _).collect::<Vec<i32>>())) as ArrayRef;
let dict_values = StringArray::from(vec![None, Some("abc")]);
let dict_indices = Int32Array::from(vec![0; n]);
// all idx point to 0 - which means that all values in the dictionary are None
let dict = DictionaryArray::new(dict_indices, Arc::new(dict_values) as ArrayRef);
let schema = Arc::new(Schema::new(vec![
Field::new("num1", DataType::Int32, false),
Field::new("num2", DataType::Int32, false), // num2 to disable SingleDistinctToGroupBy optimisation
Field::new(
"dict",
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
true,
),
]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![num.clone(), num.clone(), Arc::new(dict)],
)?;
let provider = MemTable::try_new(schema, vec![vec![batch]])?;
let mut session_config = SessionConfig::default();
session_config = session_config.set(
"datafusion.execution.target_partitions",
&ScalarValue::UInt64(Some(1u64)), // won't work with more than 1 partition
);
let ctx = SessionContext::new_with_config(session_config);
ctx.register_table("t", Arc::new(provider))?;
let df = ctx.sql("select count(distinct dict), count(num2) from t group by num1").await?;
// count(distinct ...) doesn't count None values, so the result should be 0
df.show().await?;
Ok(())
}This will produce
+------------------------+---------------+
| count(DISTINCT t.dict) | count(t.num2) |
+------------------------+---------------+
| 1 | 1 |
| 1 | 1 |
| 1 | 1 |
| 1 | 1 |
| 1 | 1 |
+------------------------+---------------+
dict only has null values so the correct result should be 0
If you change target partitions to 2 or remove num2 the results will be 0 for all columns which is correct
Expected behavior
We should either disallow nulls in dictionary values or handle them explicitly (the latter is preferable IMO).
Additional context
The fix will likely be in the Arrow repo, but created this issue for discussion and visibility.
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
bugSomething isn't workingSomething isn't working