8000 Properly prepare varinfos in estimate_multivariate_bucketsize() · petergeoghegan/postgres@9f404d7 · GitHub
[go: up one dir, main page]

Skip to content

Commit 9f404d7

Browse files
committed
Properly prepare varinfos in estimate_multivariate_bucketsize()
To estimate with extended statistics, we need to clear the varnullingrels field in the expression, and duplicates are not allowed in the GroupVarInfo list. We might re-use add_unique_group_var(), but we don't do so for two reasons. 1) We must keep the origin_rinfos list ordered exactly the same way as varinfos. 2) add_unique_group_var() is designed for estimate_num_groups(), where a larger number of groups is worse. While estimating the number of hash buckets, we have the opposite: a lesser number of groups is worse. Therefore, we don't have to remove "known equal" vars: the removed var may valuably contribute to the multivariate statistics to grow the number of groups. This commit adds custom code to estimate_multivariate_bucketsize() to initialize varinfos properly. Reported-by: Robins Tharakan <tharakan@gmail.com> Discussion: https://postgr.es/m/18885-da51324078588253%40postgresql.org Author: Andrei Lepikhov <lepihov@gmail.com> Reviewed-by: David Rowley <dgrowleyml@gmail.com> Reviewed-by: Tomas Vondra <tomas@vondra.me> Reviewed-by: Alexander Korotkov <aekorotkov@gmail.com>
1 parent 3db61db commit 9f404d7

File tree

3 files changed

+96
-3
lines changed

3 files changed

+96
-3
lines changed

src/backend/utils/adt/selfuncs.c

Lines changed: 57 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3850,6 +3850,8 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
38503850
if (bms_get_singleton_member(relids, &relid) &&
38513851
root->simple_rel_array[relid]->statlist != NIL)
38523852
{
3853+
bool is_duplicate = false;
3854+
38533855
/*
38543856
* This inner-side expression references only one relation.
38553857
* Extended statistics on this clause can exist.
@@ -3880,11 +3882,61 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
38803882
*/
38813883
continue;
38823884

3883-
varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
3885+
/*
3886+
* We're going to add the new clause to the varinfos list. We
3887+
* might re-use add_unique_group_var(), but we don't do so for
3888+
* two reasons.
3889+
*
3890+
* 1) We must keep the origin_rinfos list ordered exactly the
3891+
* same way as varinfos.
3892+
*
3893+
* 2) add_unique_group_var() is designed for
3894+
* estimate_num_groups(), where a larger number of groups is
3895+
* worse. While estimating the number of hash buckets, we
3896+
* have the opposite: a lesser number of groups is worse.
3897+
* Therefore, we don't have to remove "known equal" vars: the
3898+
* removed var may valuably contribute to the multivariate
3899+
* statistics to grow the number of groups.
3900+
*/
3901+
3902+
/*
3903+
* Clear nullingrels to correctly match hash keys. See
3904+
* add_unique_group_var()'s comment for details.
3905+
*/
3906+
expr = remove_nulling_relids(expr, root->outer_join_rels, NULL);
3907+
3908+
/*
3909+
* Detect and exclude exact duplicates from the list of hash
3910+
* keys (like add_unique_group_var does).
3911+
*/
3912+
foreach(lc1, varinfos)
3913+
{
3914+
varinfo = (GroupVarInfo *) lfirst(lc1);
3915+
3916+
if (!equal(expr, varinfo->var))
3917+
continue;
3918+
3919+
is_duplicate = true;
3920+
break;
3921+
}
3922+
3923+
if (is_duplicate)
3924+
{
3925+
/*
3926+
* Skip exact duplicates. Adding them to the otherclauses
3927+
* list also doesn't make sense.
3928+
*/
3929+
continue;
3930+
}
3931+
3932+
/*
3933+
* Initialize GroupVarInfo. We only use it to call
3934+
* estimate_multivariate_ndistinct(), which doesn't care about
3935+
* ndistinct and isdefault fields. Thus, skip these fields.
3936+
*/
3937+
varinfo = (GroupVarInfo *) palloc0(sizeof(GroupVarInfo));
38843938
varinfo->var = expr;
38853939
varinfo->rel = root->simple_rel_array[relid];
3886-
varinfo->ndistinct = 0.0;
3887-
varinfo->isdefault = false;
38883940
varinfos = lappend(varinfos, varinfo);
38893941

38903942
/*
@@ -3894,8 +3946,10 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
38943946
origin_rinfos = lappend(origin_rinfos, rinfo);
38953947
}
38963948
else
3949+
{
38973950
/* This clause can't be estimated with extended statistics */
38983951
otherclauses = lappend(otherclauses, rinfo);
3952+
}
38993953

39003954
clauses = foreach_delete_current(clauses, lc);
39013955
}

src/test/regress/expected/stats_ext.out

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3427,4 +3427,32 @@ SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
34273427
-> Seq Scan on sb_2 b
34283428
(5 rows)
34293429

3430+
-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
3431+
SET enable_nestloop = 'off';
3432+
SET enable_mergejoin = 'off';
3433+
EXPLAIN (COSTS OFF)
3434+
SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
3435+
QUERY PLAN
3436+
--------------------------------------------------------
3437+
Hash Left Join
3438+
Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.x = sb_2.x))
3439+
-> Seq Scan on sb_1
3440+
-> Hash
3441+
-> Seq Scan on sb_2
3442+
(5 rows)
3443+
3444+
EXPLAIN (COSTS OFF)
3445+
SELECT FROM sb_1 LEFT JOIN sb_2
3446+
ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y);
3447+
QUERY PLAN
3448+
------------------------------------------------------------------------------
3449+
Hash Left Join
3450+
Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.y = sb_2.y) AND (sb_1.x = sb_2.x))
3451+
-> Seq Scan on sb_1
3452+
-> Hash
3453+
-> Seq Scan on sb_2
3454+
(5 rows)
3455+
3456+
RESET enable_nestloop;
3457+
RESET enable_mergejoin;
34303458
DROP TABLE sb_1, sb_2 CASCADE;

src/test/regress/sql/stats_ext.sql

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1747,4 +1747,15 @@ ANALYZE sb_2;
17471747
EXPLAIN (COSTS OFF) -- Choose hash join
17481748
SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
17491749

1750+
-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
1751+
SET enable_nestloop = 'off';
1752+
SET enable_mergejoin = 'off';
1753+
EXPLAIN (COSTS OFF)
1754+
SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
1755+
EXPLAIN (COSTS OFF)
1756+
SELECT FROM sb_1 LEFT JOIN sb_2
1757+
ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y);
1758+
RESET enable_nestloop;
1759+
RESET enable_mergejoin;
1760+
17501761
DROP TABLE sb_1, sb_2 CASCADE;

0 commit comments

Comments
 (0)
10F
0