Skip to content

Wrong results of Left Anti Semi (Not-In) Join in nestloop and hashjoin #50

@my-ship-it

Description

@my-ship-it
--
-- Test left anti semi (not-in) join
-- Join on the distribution key of both sides.
--
begin;
\pset null '<NULL>'
create table t1_lasj(c1 int) distributed by (c1);
create table t2_lasj_has_null(c1n int) distributed by (c1n);
insert into t1_lasj values (generate_series (1,10));
insert into t2_lasj_has_null values (1), (2), (3), (null), (5), (6), (7);
analyze t1_lasj;
analyze t2_lasj_has_null;
select c1n from t2_lasj_has_null where c1n is null or c1n > 0;
  c1n   
--------
      2
      3
 <NULL>
      7
      1
      5
      6
(7 rows)

-- Hash left anti semi (not-in) join
explain(costs off) select c1 from t1_lasj where c1 not in (select c1n from t2_lasj_has_null where c1n is null or c1n > 0) and c1 is not null;
                        QUERY PLAN                        
----------------------------------------------------------
 Gather Motion 3:1  (slice1; segments: 3)
   ->  Hash Left Anti Semi (Not-In) Join
         Hash Cond: (t1_lasj.c1 = t2_lasj_has_null.c1n)
         ->  Seq Scan on t1_lasj
               Filter: (c1 IS NOT NULL)
         ->  Hash
               ->  Seq Scan on t2_lasj_has_null
                     Filter: ((c1n IS NULL) OR (c1n > 0))
 Optimizer: Postgres query optimizer
(9 rows)

select c1 from t1_lasj where c1 not in (select c1n from t2_lasj_has_null where c1n is null or c1n > 0) and c1 is not null;
 c1 
----
  9
 10
(2 rows)

set local enable_hashjoin = off;
set local enable_nestloop = on;
-- Nested loop left anti semi (not-in) join
explain(costs off) select c1 from t1_lasj where c1 not in (select c1n from t2_lasj_has_null where c1n is null or c1n > 0) and c1 is not null;
                        QUERY PLAN                        
----------------------------------------------------------
 Gather Motion 3:1  (slice1; segments: 3)
   ->  Nested Loop Left Anti Semi (Not-In) Join
         Join Filter: (t1_lasj.c1 = t2_lasj_has_null.c1n)
         ->  Seq Scan on t1_lasj
               Filter: (c1 IS NOT NULL)
         ->  Materialize
               ->  Seq Scan on t2_lasj_has_null
                     Filter: ((c1n IS NULL) OR (c1n > 0))
 Optimizer: Postgres query optimizer
(9 rows)

select c1 from t1_lasj where c1 not in (select c1n from t2_lasj_has_null where c1n is null or c1n > 0) and c1 is not null;
 c1 
----
  9
 10
(2 rows)

reset enable_hashjoin;
reset enable_nestloop;
\pset null ''
abort;

ORCA correct results:

--
-- Test left anti semi (not-in) join
-- Join on the distribution key of both sides.
--
begin;
\pset null '<NULL>'
create table t1_lasj(c1 int) distributed by (c1);
create table t2_lasj_has_null(c1n int) distributed by (c1n);
insert into t1_lasj values (generate_series (1,10));
insert into t2_lasj_has_null values (1), (2), (3), (null), (5), (6), (7);
analyze t1_lasj;
analyze t2_lasj_has_null;
select c1n from t2_lasj_has_null where c1n is null or c1n > 0;
  c1n   
--------
      2
      3
 <NULL>
      7
      5
      6
      1
(7 rows)

-- Hash left anti semi (not-in) join
explain(costs off) select c1 from t1_lasj where c1 not in (select c1n from t2_lasj_has_null where c1n is null or c1n > 0) and c1 is not null;
                           QUERY PLAN                           
----------------------------------------------------------------
 Gather Motion 3:1  (slice1; segments: 3)
   ->  Hash Left Anti Semi (Not-In) Join
         Hash Cond: (t1_lasj.c1 = t2_lasj_has_null.c1n)
         ->  Seq Scan on t1_lasj
               Filter: (NOT (c1 IS NULL))
         ->  Hash
               ->  Broadcast Motion 3:3  (slice2; segments: 3)
                     ->  Seq Scan on t2_lasj_has_null
                           Filter: ((c1n IS NULL) OR (c1n > 0))
 Optimizer: Pivotal Optimizer (GPORCA)
(10 rows)

select c1 from t1_lasj where c1 not in (select c1n from t2_lasj_has_null where c1n is null or c1n > 0) and c1 is not null;
 c1 
----
(0 rows)

set local enable_hashjoin = off;
set local enable_nestloop = on;
-- Nested loop left anti semi (not-in) join
explain(costs off) select c1 from t1_lasj where c1 not in (select c1n from t2_lasj_has_null where c1n is null or c1n > 0) and c1 is not null;
                           QUERY PLAN                           
----------------------------------------------------------------
 Gather Motion 3:1  (slice1; segments: 3)
   ->  Hash Left Anti Semi (Not-In) Join
         Hash Cond: (t1_lasj.c1 = t2_lasj_has_null.c1n)
         ->  Seq Scan on t1_lasj
               Filter: (NOT (c1 IS NULL))
         ->  Hash
               ->  Broadcast Motion 3:3  (slice2; segments: 3)
                     ->  Seq Scan on t2_lasj_has_null
                           Filter: ((c1n IS NULL) OR (c1n > 0))
 Optimizer: Pivotal Optimizer (GPORCA)
(10 rows)

select c1 from t1_lasj where c1 not in (select c1n from t2_lasj_has_null where c1n is null or c1n > 0) and c1 is not null;
 c1 
----
(0 rows)

reset enable_hashjoin;
reset enable_nestloop;
\pset null ''
abort;

See GPDB: https://github.com/greenplum-db/gpdb/issues/15662.

Metadata

Metadata

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions