Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
405 changes: 297 additions & 108 deletions cpp/src/arrow/compute/exec/expression.cc

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions cpp/src/arrow/compute/exec/expression.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ class ARROW_EXPORT Expression {
/// Return true if this expression is literal and entirely null.
bool IsNullLiteral() const;

/// Return true if this expression could evaluate to true.
/// Return true if this expression could evaluate to true. Will return true for any
/// unbound, non-boolean, or unsimplified Expressions
bool IsSatisfiable() const;

// XXX someday
Expand Down Expand Up @@ -171,8 +172,10 @@ std::vector<FieldRef> FieldsInExpression(const Expression&);
ARROW_EXPORT
bool ExpressionHasFieldRefs(const Expression&);

/// Assemble a mapping from field references to known values.
struct ARROW_EXPORT KnownFieldValues;

/// Assemble a mapping from field references to known values. This derives known values
/// from "equal" and "is_null" Expressions referencing a field and a literal.
ARROW_EXPORT
Result<KnownFieldValues> ExtractKnownFieldValues(
const Expression& guaranteed_true_predicate);
Expand Down
137 changes: 120 additions & 17 deletions cpp/src/arrow/compute/exec/expression_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ Expression cast(Expression argument, std::shared_ptr<DataType> to_type) {
compute::CastOptions::Safe(std::move(to_type)));
}

Expression true_unless_null(Expression argument) {
return call("true_unless_null", {std::move(argument)});
}

template <typename Actual, typename Expected>
void ExpectResultsEqual(Actual&& actual, Expected&& expected) {
using MaybeActual = typename EnsureResult<typename std::decay<Actual>::type>::type;
Expand Down Expand Up @@ -250,8 +254,8 @@ TEST(Expression, ToString) {
EXPECT_EQ(literal(3).ToString(), "3");
EXPECT_EQ(literal("a").ToString(), "\"a\"");
EXPECT_EQ(literal("a\nb").ToString(), "\"a\\nb\"");
EXPECT_EQ(literal(std::make_shared<BooleanScalar>()).ToString(), "null");
EXPECT_EQ(literal(std::make_shared<Int64Scalar>()).ToString(), "null");
EXPECT_EQ(literal(std::make_shared<BooleanScalar>()).ToString(), "null[bool]");
EXPECT_EQ(literal(std::make_shared<Int64Scalar>()).ToString(), "null[int64]");
EXPECT_EQ(literal(std::make_shared<BinaryScalar>(Buffer::FromString("az"))).ToString(),
"\"617A\"");

Expand Down Expand Up @@ -388,29 +392,49 @@ TEST(Expression, IsScalarExpression) {
}

TEST(Expression, IsSatisfiable) {
auto Bind = [](Expression expr) { return expr.Bind(*kBoringSchema).ValueOrDie(); };

EXPECT_TRUE(literal(true).IsSatisfiable());
EXPECT_FALSE(literal(false).IsSatisfiable());

auto null = std::make_shared<BooleanScalar>();
EXPECT_FALSE(literal(null).IsSatisfiable());

EXPECT_TRUE(field_ref("a").IsSatisfiable());
// NB: no implicit conversion to bool
EXPECT_TRUE(literal(0).IsSatisfiable());

EXPECT_TRUE(field_ref("i32").IsSatisfiable());
EXPECT_TRUE(Bind(field_ref("i32")).IsSatisfiable());

EXPECT_TRUE(equal(field_ref("a"), literal(1)).IsSatisfiable());
EXPECT_TRUE(equal(field_ref("i32"), literal(1)).IsSatisfiable());
EXPECT_TRUE(Bind(equal(field_ref("i32"), literal(1))).IsSatisfiable());

// NB: no constant folding here
EXPECT_TRUE(equal(literal(0), literal(1)).IsSatisfiable());

// When a top level conjunction contains an Expression which is certain to evaluate to
// null, it can only evaluate to null or false.
auto never_true = and_(literal(null), field_ref("a"));
// This may appear in satisfiable filters if coalesced (for example, wrapped in fill_na)
EXPECT_TRUE(call("is_null", {never_true}).IsSatisfiable());
// ... but at the top level it is not satisfiable.
EXPECT_TRUE(Bind(equal(literal(0), literal(1))).IsSatisfiable());

// Special case invert(true_unless_null(x)): arises in simplification against a
// guarantee with a nullable caveat.
EXPECT_FALSE(Bind(not_(true_unless_null(field_ref("i32")))).IsSatisfiable());
// NB: no effort to examine unbound expressions
EXPECT_TRUE(not_(true_unless_null(field_ref("i32"))).IsSatisfiable());

// When a top level conjunction contains an Expression which is not satisfiable
// (guaranteed to evaluate to null or false), it can only evaluate to null or false.
// This special case arises when (for example) an absent column has made
// one member of the conjunction always-null. This is fairly common and
// would be a worthwhile optimization to support.
// EXPECT_FALSE(null_or_false).IsSatisfiable());
// one member of the conjunction always-null.
for (const auto& never_true : {
// N.B. this is "and_kleene"
and_(literal(false), field_ref("bool")),
and_(literal(null), field_ref("bool")),
call("and", {literal(false), field_ref("bool")}),
call("and", {literal(null), field_ref("bool")}),
}) {
ARROW_SCOPED_TRACE(never_true.ToString());
EXPECT_FALSE(Bind(never_true).IsSatisfiable());
// ... but it may appear in satisfiable filters if coalesced (for example, wrapped in
// fill_na)
EXPECT_TRUE(Bind(call("is_null", {never_true})).IsSatisfiable());
}
}

TEST(Expression, FieldsInExpression) {
Expand Down Expand Up @@ -846,6 +870,10 @@ TEST(Expression, FoldConstants) {
}),
literal(4));

// INTERSECTION null handling and null input -> null output
ExpectFoldsTo(call("equal", {field_ref("i32"), null_literal(int32())}),
null_literal(boolean()));

// nested call against literals with one field_ref
// (i32 - (2 * 3)) + 2 == (i32 - 6) + 2
// NB this could be improved further by using associativity of addition; another pass
Expand Down Expand Up @@ -1066,8 +1094,7 @@ TEST(Expression, CanonicalizeAnd) {
and_(and_(and_(and_(null_, null_), true_), b), c));

// catches and_kleene even when it's a subexpression
ExpectCanonicalizesTo(call("is_valid", {and_(b, true_)}),
call("is_valid", {and_(true_, b)}));
ExpectCanonicalizesTo(is_valid(and_(b, true_)), is_valid(and_(true_, b)));
}

TEST(Expression, CanonicalizeComparison) {
Expand Down Expand Up @@ -1279,13 +1306,89 @@ TEST(Expression, SimplifyWithGuarantee) {
.WithGuarantee(not_(equal(field_ref("i32"), literal(7))))
.Expect(equal(field_ref("i32"), literal(7)));

// In the absence of is_null(i32) we assume i32 is valid
Simplify{
is_null(field_ref("i32")),
}
.WithGuarantee(greater_equal(field_ref("i32"), literal(1)))
.Expect(false);

Simplify{
is_null(field_ref("i32")),
}
.WithGuarantee(
or_(greater_equal(field_ref("i32"), literal(1)), is_null(field_ref("i32"))))
.Expect(is_null(field_ref("i32")));

Simplify{
is_null(field_ref("i32")),
}
.WithGuarantee(
and_(greater_equal(field_ref("i32"), literal(1)), is_valid(field_ref("i32"))))
.Expect(false);

Simplify{
is_valid(field_ref("i32")),
}
.WithGuarantee(greater_equal(field_ref("i32"), literal(1)))
.Expect(true);

Simplify{
is_valid(field_ref("i32")),
}
.WithGuarantee(
or_(greater_equal(field_ref("i32"), literal(1)), is_null(field_ref("i32"))))
.Expect(is_valid(field_ref("i32")));

Simplify{
is_valid(field_ref("i32")),
}
.WithGuarantee(
and_(greater_equal(field_ref("i32"), literal(1)), is_valid(field_ref("i32"))))
.Expect(true);
}

TEST(Expression, SimplifyWithValidityGuarantee) {
Simplify{is_null(field_ref("i32"))}
.WithGuarantee(is_null(field_ref("i32")))
.Expect(literal(true));

Simplify{is_valid(field_ref("i32"))}
.WithGuarantee(is_null(field_ref("i32")))
.Expect(literal(false));

Simplify{is_valid(field_ref("i32"))}
.WithGuarantee(is_valid(field_ref("i32")))
.Expect(literal(true));

Simplify{is_valid(field_ref("i32"))}
.WithGuarantee(is_valid(field_ref("dict_i32"))) // different field
.Expect(is_valid(field_ref("i32")));

Simplify{is_null(field_ref("i32"))}
.WithGuarantee(is_valid(field_ref("i32")))
.Expect(literal(false));

Simplify{true_unless_null(field_ref("i32"))}
.WithGuarantee(is_valid(field_ref("i32")))
.Expect(literal(true));
}

TEST(Expression, SimplifyWithComparisonAndNullableCaveat) {
auto i32_is_2_or_null =
or_(equal(field_ref("i32"), literal(2)), is_null(field_ref("i32")));

Simplify{equal(field_ref("i32"), literal(2))}
.WithGuarantee(i32_is_2_or_null)
.Expect(true_unless_null(field_ref("i32")));

// XXX: needs a rule for 'true_unless_null(x) || is_null(x)'
// Simplify{i32_is_2_or_null}.WithGuarantee(i32_is_2_or_null).Expect(literal(true));

Simplify{equal(field_ref("i32"), literal(3))}
.WithGuarantee(i32_is_2_or_null)
.Expect(not_(
true_unless_null(field_ref("i32")))); // not satisfiable, will drop row group
}

TEST(Expression, SimplifyThenExecute) {
Expand Down
72 changes: 53 additions & 19 deletions cpp/src/arrow/compute/kernels/scalar_validity.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@ struct IsValidOperator {
}

static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
if (arr.type->id() == Type::NA) {
// Input is all nulls => output is entirely false.
ARROW_ASSIGN_OR_RAISE(out->buffers[1],
ctx->AllocateBitmap(out->length + out->offset));
bit_util::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length,
false);
return Status::OK();
}

DCHECK_EQ(out->offset, 0);
DCHECK_LE(out->length, arr.length);
if (arr.MayHaveNulls()) {
Expand Down Expand Up @@ -146,6 +155,29 @@ struct IsNullOperator {
}
};

struct TrueUnlessNullOperator {
static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
checked_cast<BooleanScalar*>(out)->is_valid = in.is_valid;
checked_cast<BooleanScalar*>(out)->value = true;
return Status::OK();
}

static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
// NullHandling::INTERSECTION with a single input means the execution engine
// has already reused or allocated a null_bitmap which can be reused as the values
// buffer.
if (out->buffers[0]) {
out->buffers[1] = out->buffers[0];
} else {
// But for all-valid inputs, the engine will skip allocating a
// buffer; we have to allocate one ourselves
ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(arr.length));
std::memset(out->buffers[1]->mutable_data(), 0xFF, out->buffers[1]->size());
}
return Status::OK();
}
};

struct IsNanOperator {
template <typename OutType, typename InType>
static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
Expand All @@ -156,14 +188,15 @@ struct IsNanOperator {
void MakeFunction(std::string name, const FunctionDoc* doc,
std::vector<InputType> in_types, OutputType out_type,
ArrayKernelExec exec, FunctionRegistry* registry,
MemAllocation::type mem_allocation, bool can_write_into_slices,
MemAllocation::type mem_allocation, NullHandling::type null_handling,
bool can_write_into_slices,
const FunctionOptions* default_options = NULLPTR,
KernelInit init = NULLPTR) {
Arity arity{static_cast<int>(in_types.size())};
auto func = std::make_shared<ScalarFunction>(name, arity, doc, default_options);

ScalarKernel kernel(std::move(in_types), out_type, exec, init);
kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
kernel.null_handling = null_handling;
kernel.can_write_into_slices = can_write_into_slices;
kernel.mem_allocation = mem_allocation;

Expand Down Expand Up @@ -247,21 +280,7 @@ std::shared_ptr<ScalarFunction> MakeIsNanFunction(std::string name,
}

Status IsValidExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const Datum& arg0 = batch[0];
if (arg0.type()->id() == Type::NA) {
auto false_value = std::make_shared<BooleanScalar>(false);
if (arg0.kind() == Datum::SCALAR) {
out->value = false_value;
} else {
std::shared_ptr<Array> false_values;
RETURN_NOT_OK(MakeArrayFromScalar(*false_value, out->length(), ctx->memory_pool())
.Value(&false_values));
out->value = false_values->data();
}
return Status::OK();
} else {
return applicator::SimpleUnary<IsValidOperator>(ctx, batch, out);
}
return applicator::SimpleUnary<IsValidOperator>(ctx, batch, out);
}

Status IsNullExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
Expand All @@ -281,6 +300,10 @@ Status IsNullExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
}
}

Status TrueUnlessNullExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
return applicator::SimpleUnary<TrueUnlessNullOperator>(ctx, batch, out);
}

const FunctionDoc is_valid_doc(
"Return true if non-null",
("For each input value, emit true iff the value is valid (i.e. non-null)."),
Expand All @@ -303,6 +326,11 @@ const FunctionDoc is_null_doc(
"True may also be emitted for NaN values by setting the `nan_is_null` flag."),
{"values"}, "NullOptions");

const FunctionDoc true_unless_null_doc("Return true if non-null, else return null",
("For each input value, emit true iff the value\n"
"is valid (non-null), otherwise emit null."),
{"values"});

const FunctionDoc is_nan_doc("Return true if NaN",
("For each input value, emit true iff the value is NaN."),
{"values"});
Expand All @@ -312,12 +340,18 @@ const FunctionDoc is_nan_doc("Return true if NaN",
void RegisterScalarValidity(FunctionRegistry* registry) {
static auto kNullOptions = NullOptions::Defaults();
MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec,
registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false);
registry, MemAllocation::NO_PREALLOCATE, NullHandling::OUTPUT_NOT_NULL,
/*can_write_into_slices=*/false);

MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec,
registry, MemAllocation::PREALLOCATE,
registry, MemAllocation::PREALLOCATE, NullHandling::OUTPUT_NOT_NULL,
/*can_write_into_slices=*/true, &kNullOptions, NanOptionsState::Init);

MakeFunction("true_unless_null", &true_unless_null_doc, {ValueDescr::ANY}, boolean(),
TrueUnlessNullExec, registry, MemAllocation::NO_PREALLOCATE,
NullHandling::INTERSECTION,
/*can_write_into_slices=*/false);

DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc)));
DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc)));
DCHECK_OK(registry->AddFunction(MakeIsNanFunction("is_nan", &is_nan_doc)));
Expand Down
19 changes: 19 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_validity_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,25 @@ TEST_F(TestBooleanValidityKernels, ArrayIsValid) {
"[false, true, true, false]");
}

TEST_F(TestBooleanValidityKernels, TrueUnlessNull) {
CheckScalarUnary("true_unless_null", type_singleton(), "[]", type_singleton(), "[]");
CheckScalarUnary("true_unless_null", type_singleton(), "[null]", type_singleton(),
"[null]");
CheckScalarUnary("true_unless_null", type_singleton(), "[0, 1]", type_singleton(),
"[true, true]");
CheckScalarUnary("true_unless_null", type_singleton(), "[null, 1, 0, null]",
type_singleton(), "[null, true, true, null]");
}

TEST_F(TestBooleanValidityKernels, IsValidIsNullNullType) {
CheckScalarUnary("is_null", std::make_shared<NullArray>(5),
ArrayFromJSON(boolean(), "[true, true, true, true, true]"));
CheckScalarUnary("is_valid", std::make_shared<NullArray>(5),
ArrayFromJSON(boolean(), "[false, false, false, false, false]"));
CheckScalarUnary("true_unless_null", std::make_shared<NullArray>(5),
ArrayFromJSON(boolean(), "[null, null, null, null, null]"));
}

TEST_F(TestBooleanValidityKernels, ArrayIsValidBufferPassthruOptimization) {
Datum arg = ArrayFromJSON(boolean(), "[null, 1, 0, null]");
ASSERT_OK_AND_ASSIGN(auto validity, arrow::compute::IsValid(arg));
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/dataset/file_csv_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,7 @@ TEST_P(TestCsvFileFormatScan, ScanRecordBatchReaderWithVirtualColumn) {
TEST_P(TestCsvFileFormatScan, ScanRecordBatchReaderWithDuplicateColumnError) {
TestScanWithDuplicateColumnError();
}
TEST_P(TestCsvFileFormatScan, ScanWithPushdownNulls) { TestScanWithPushdownNulls(); }

INSTANTIATE_TEST_SUITE_P(TestScan, TestCsvFileFormatScan,
::testing::ValuesIn(TestFormatParams::Values()),
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/dataset/file_ipc_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ TEST_P(TestIpcFileFormatScan, ScanRecordBatchReaderWithDuplicateColumn) {
TEST_P(TestIpcFileFormatScan, ScanRecordBatchReaderWithDuplicateColumnError) {
TestScanWithDuplicateColumnError();
}
TEST_P(TestIpcFileFormatScan, ScanWithPushdownNulls) { TestScanWithPushdownNulls(); }
TEST_P(TestIpcFileFormatScan, FragmentScanOptions) {
auto reader = GetRecordBatchReader(
// ARROW-12077: on Windows/mimalloc/release, nullable list column leads to crash
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/dataset/file_orc_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderWithDuplicateColumn) {
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderWithDuplicateColumnError) {
TestScanWithDuplicateColumnError();
}
TEST_P(TestOrcFileFormatScan, ScanWithPushdownNulls) { TestScanWithPushdownNulls(); }
INSTANTIATE_TEST_SUITE_P(TestScan, TestOrcFileFormatScan,
::testing::ValuesIn(TestFormatParams::Values()),
TestFormatParams::ToTestNameString);
Expand Down
Loading