8000 Enable use of Memoize for ANTI joins · sthagen/postgres-postgres@0da29e4 · GitHub
[go: up one dir, main page]

Skip to content

Commit 0da29e4

Browse files
author
Richard Guo
committed
Enable use of Memoize for ANTI joins
Currently, we do not support Memoize for SEMI and ANTI joins because nested loop SEMI/ANTI joins do not scan the inner relation to completion, which prevents Memoize from marking the cache entry as complete. One might argue that we could mark the cache entry as complete after fetching the first inner tuple, but that would not be safe: if the first inner tuple and the current outer tuple do not satisfy the join clauses, a second inner tuple matching the parameters would find the cache entry already marked as complete. However, if the inner side is provably unique, this issue doesn't arise, since there would be no second matching tuple. That said, this doesn't help in the case of SEMI joins, because a SEMI join with a provably unique inner side would already have been reduced to an inner join by reduce_unique_semijoins. Therefore, in this patch, we check whether the inner relation is provably unique for ANTI joins and enable the use of Memoize in such cases. Author: Richard Guo <guofenglinux@gmail.com> Reviewed-by: wenhui qiu <qiuwenhuifx@gmail.com> Reviewed-by: Andrei Lepikhov <lepihov@gmail.com> Discussion: https://postgr.es/m/CAMbWs48FdLiMNrmJL-g6mDvoQVt0yNyJAqMkv4e2Pk-5GKCZLA@mail.gmail.com
1 parent 7b2eb72 commit 0da29e4

File tree

3 files changed

+112
-22
lines changed

3 files changed

+112
-22
lines changed

src/backend/optimizer/path/joinpath.c

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -154,13 +154,17 @@ add_paths_to_joinrel(PlannerInfo *root,
154154
/*
155155
* See if the inner relation is provably unique for this outer rel.
156156
*
157-
* We have some special cases: for JOIN_SEMI and JOIN_ANTI, it doesn't
158-
* matter since the executor can make the equivalent optimization anyway;
159-
* we need not expend planner cycles on proofs. For JOIN_UNIQUE_INNER, we
160-
* must be considering a semijoin whose inner side is not provably unique
161-
* (else reduce_unique_semijoins would've simplified it), so there's no
162-
* point in calling innerrel_is_unique. However, if the LHS covers all of
163-
* the semijoin's min_lefthand, then it's appropriate to set inner_unique
157+
* We have some special cases: for JOIN_SEMI, it doesn't matter since the
158+
* executor can make the equivalent optimization anyway. It also doesn't
159+
* help enable use of Memoize, since a semijoin with a provably unique
160+
* inner side should have been reduced to an inner join in that case.
161+
* Therefore, we need not expend planner cycles on proofs. (For
162+
* JOIN_ANTI, although it doesn't help the executor for the same reason,
163+
* it can benefit Memoize paths.) For JOIN_UNIQUE_INNER, we must be
164+
* considering a semijoin whose inner side is not provably unique (else
165+
* reduce_unique_semijoins would've simplified it), so there's no point in
166+
* calling innerrel_is_unique. However, if the LHS covers all of the
167+
* semijoin's min_lefthand, then it's appropriate to set inner_unique
164168
* because the path produced by create_unique_path will be unique relative
165169
* to the LHS. (If we have an LHS that's only part of the min_lefthand,
166170
* that is *not* true.) For JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid
@@ -169,12 +173,6 @@ add_paths_to_joinrel(PlannerInfo *root,
169173
switch (jointype)
170174
{
171175
case JOIN_SEMI:
172-
case JOIN_ANTI:
173-
174-
/*
175-
* XXX it may be worth proving this to allow a Memoize to be
176-
* considered for Nested Loop Semi/Anti Joins.
177-
*/
178176
extra.inner_unique = false; /* well, unproven */
179177
break;
180178
case JOIN_UNIQUE_INNER:
@@ -715,16 +713,21 @@ get_memoize_path(PlannerInfo *root, RelOptInfo *innerrel,
715713
return NULL;
716714

717715
/*
718-
* Currently we don't do this for SEMI and ANTI joins unless they're
719-
* marked as inner_unique. This is because nested loop SEMI/ANTI joins
720-
* don't scan the inner node to completion, which will mean memoize cannot
721-
* mark the cache entry as complete.
722-
*
723-
* XXX Currently we don't attempt to mark SEMI/ANTI joins as inner_unique
724-
* = true. Should we? See add_paths_to_joinrel()
716+
* Currently we don't do this for SEMI and ANTI joins, because nested loop
717+
* SEMI/ANTI joins don't scan the inner node to completion, which means
718+
* memoize cannot mark the cache entry as complete. Nor can we mark the
719+
* cache entry as complete after fetching the first inner tuple, because
720+
* if that tuple and the current outer tuple don't satisfy the join
721+
* clauses, a second inner tu 10000 ple that satisfies the parameters would find
722+
* the cache entry already marked as complete. The only exception is when
723+
* the inner relation is provably unique, as in that case, there won't be
724+
* a second matching tuple and we can safely mark the cache entry as
725+
* complete after fetching the first inner tuple. Note that in such
726+
* cases, the SEMI join should have been reduced to an inner join by
727+
* reduce_unique_semijoins.
725728
*/
726-
if (!extra->inner_unique && (jointype == JOIN_SEMI ||
727-
jointype == JOIN_ANTI))
729+
if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
730+
!extra->inner_unique)
728731
return NULL;
729732

730733
/*

src/test/regress/expected/memoize.out

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ begin
2525
ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N');
2626
ln := regexp_replace(ln, 'loops=\d+', 'loops=N');
2727
ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N');
28+
ln := regexp_replace(ln, 'Memory: \d+kB', 'Memory: NkB');
2829
return next ln;
2930
end loop;
3031
end;
@@ -500,3 +501,62 @@ RESET max_parallel_workers_per_gather;
500501
RESET parallel_tuple_cost;
501502
RESET parallel_setup_cost;
502503
RESET min_parallel_table_scan_size;
504+
-- Ensure memoize works for ANTI joins
505+
CREATE TABLE tab_anti (a int, b boolean);
506+
INSERT INTO tab_anti SELECT i%3, false FROM generate_series(1,100)i;
507+
ANALYZE tab_anti;
508+
-- Ensure we get a Memoize plan for ANTI join
509+
SELECT explain_memoize('
510+
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
511+
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
512+
ON t1.a+1 = t2.a
513+
WHERE t2.a IS NULL;', false);
514+
explain_memoize
515+
--------------------------------------------------------------------------------------------
516+
Aggregate (actual rows=1.00 loops=N)
517+
-> Nested Loop Anti Join (actual rows=33.00 loops=N)
518+
-> Seq Scan on tab_anti t1 (actual rows=100.00 loops=N)
519+
-> Memoize (actual rows=0.67 loops=N)
520+
Cache Key: (t1.a + 1), t1.a
521+
Cache Mode: binary
522+
Hits: 97 Misses: 3 Evictions: Zero Overflows: 0 Memory Usage: NkB
523+
-> Subquery Scan on t2 (actual rows=0.67 loops=N)
524+
Filter: ((t1.a + 1) = t2.a)
525+
Rows Removed by Filter: 2
526+
-> Unique (actual rows=2.67 loops=N)
527+
-> Sort (actual rows=67.33 loops=N)
528+
Sort Key: t2_1.a
529+
Sort Method: quicksort Memory: NkB
530+
-> Seq Scan on tab_anti t2_1 (actual rows=100.00 loops=N)
531+
(15 rows)
532+
533+
-- And check we get the expected results.
534+
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
535+
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
536+
ON t1.a+1 = t2.a
537+
WHERE t2.a IS NULL;
538+
count
539+
-------
540+
33
541+
(1 row)
542+
543+
-- Ensure we do not add memoize node for SEMI join
544+
EXPLAIN (COSTS OFF)
545+
SELECT * FROM tab_anti t1 WHERE t1.a IN
546+
(SELECT a FROM tab_anti t2 WHERE t2.b IN
547+
(SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0));
548+
QUERY PLAN
549+
-------------------------------------------------
550+
Nested Loop Semi Join
551+
-> Seq Scan on tab_anti t1
552+
-> Nested Loop Semi Join
553+
Join Filter: (t1.a = t2.a)
554+
-> Seq Scan on tab_anti t2
555+
-> Subquery Scan on "ANY_subquery"
556+
Filter: (t2.b = "ANY_subquery".b)
557+
-> Result
558+
One-Time Filter: (t2.a > 1)
559+
-> Seq Scan on tab_anti t3
560+
(10 rows)
561+
562+
DROP TABLE tab_anti;

src/test/regress/sql/memoize.sql

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ begin
2626
ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N');
2727
ln := regexp_replace(ln, 'loops=\d+', 'loops=N');
2828
ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N');
29+
ln := regexp_replace(ln, 'Memory: \d+kB', 'Memory: NkB');
2930
return next ln;
3031
end loop;
3132
end;
@@ -244,3 +245,29 @@ RESET max_parallel_workers_per_gather;
244245
RESET parallel_tuple_cost;
245246
RESET parallel_setup_cost;
246247
RESET min_parallel_table_scan_size;
248+
249+
-- Ensure memoize works for ANTI joins
250+
CREATE TABLE tab_anti (a int, b boolean);
251+
INSERT INTO tab_anti SELECT i%3, false FROM generate_series(1,100)i;
252+
ANALYZE tab_anti;
253+
254+
-- Ensure we get a Memoize plan for ANTI join
255+
SELECT explain_memoize('
256+
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
257+
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
258+
ON t1.a+1 = t2.a
259+
WHERE t2.a IS NULL;', false);
260+
261+
-- And check we get the expected results.
262+
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
263+
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
264+
ON t1.a+1 = t2.a
265+
WHERE t2.a IS NULL;
266+
267+
-- Ensure we do not add memoize node for SEMI join
268+
EXPLAIN (COSTS OFF)
269+
SELECT * FROM tab_anti t1 WHERE t1.a IN
270+
(SELECT a FROM tab_anti t2 WHERE t2.b IN
271+
(SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0));
272+
273+
DROP TABLE tab_anti;

0 commit comments

Comments
 (0)
0