8000 Improve relation width estimation for subqueries. · postgrespro/postgres@0f61d4d · GitHub
[go: up one dir, main page]

Skip to content
  • Commit 0f61d4d

    Browse files
    committed
    Improve relation width estimation for subqueries.
    As per the ancient comment for set_rel_width, it really wasn't much good for relations that aren't plain tables: it would never find any stats and would always fall back on datatype-based estimates, which are often pretty silly. Fix that by copying up width estimates from the subquery planning process. At some point we might want to do this for CTEs too, but that would be a significantly more invasive patch because the sub-PlannerInfo is no longer accessible by the time it's needed. I refrained from doing anything about that, partly for fear of breaking the unmerged CTE-related patches. In passing, also generate less bogus width estimates for whole-row Vars. Per a gripe from Jon Nelson.
    1 parent fe24d78 commit 0f61d4d

    File tree

    6 files changed

    +150
    -20
    lines changed

    6 files changed

    +150
    -20
    lines changed

    src/backend/optimizer/path/allpaths.c

    Lines changed: 1 addition & 4 deletions
    Original file line numberDiff line numberDiff line change
    @@ -758,11 +758,8 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel,
    758758
    rel->subrtable = subroot->parse->rtable;
    759759
    rel->subrowmark = subroot->rowMarks;
    760760

    761-
    /* Copy number of output rows from subplan */
    762-
    rel->tuples = rel->subplan->plan_rows;
    763-
    764761
    /* Mark rel with estimated output rows, width, etc */
    765-
    set_baserel_size_estimates(root, rel);
    762+
    set_subquery_size_estimates(root, rel, subroot);
    766763

    767764
    /* Convert subquery pathkeys to outer representation */
    768765
    pathkeys = convert_subquery_pathkeys(root, rel, subroot->query_pathkeys);

    src/backend/optimizer/path/costsize.c

    Lines changed: 129 additions & 8 deletions
    Original file line numberDiff line numberDiff line change
    @@ -76,6 +76,7 @@
    7676
    #include "optimizer/cost.h"
    7777
    #include "optimizer/pathnode.h"
    7878
    #include "optimizer/placeholder.h"
    79+
    #include "optimizer/plancat.h"
    7980
    #include "optimizer/planmain.h"
    8081
    #include "optimizer/restrictinfo.h"
    8182
    #include "parser/parsetree.h"
    @@ -2986,7 +2987,7 @@ approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
    29862987
    * Set the size estimates for the given base relation.
    29872988
    *
    29882989
    * The rel's targetlist and restrictinfo list must have been constructed
    2989-
    * already.
    2990+
    * already, and rel->tuples must be set.
    29902991
    *
    29912992
    * We set the following fields of the rel node:
    29922993
    * rows: the estimated number of output tuples (after applying
    @@ -3151,6 +3152,76 @@ set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
    31513152
    rel->rows = clamp_row_est(nrows);
    31523153
    }
    31533154

    3155+
    /*
    3156+
    * set_subquery_size_estimates
    3157+
    * Set the size estimates for a base relation that is a subquery.
    3158+
    *
    3159+
    * The rel's targetlist and restrictinfo list must have been constructed
    3160+
    * already, and the plan for the subquery must have been completed.
    3161+
    * We look at the subquery's plan and PlannerInfo to extract data.
    3162+
    *
    3163+
    * We set the same fields as set_baserel_size_estimates.
    3164+
    */
    3165+
    void
    3166+
    set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel,
    3167+
    PlannerInfo *subroot)
    3168+
    {
    3169+
    RangeTblEntry *rte;
    3170+
    ListCell *lc;
    3171+
    3172+
    /* Should only be applied to base relations that are subqueries */
    3173+
    Assert(rel->relid > 0);
    3174+
    rte = planner_rt_fetch(rel->relid, root);
    3175+
    Assert(rte->rtekind == RTE_SUBQUERY);
    3176+
    3177+
    /* Copy raw number of output rows from subplan */
    3178+
    rel->tuples = rel->subplan->plan_rows;
    3179+
    3180+
    /*
    3181+
    * Compute per-output-column width estimates by examining the subquery's
    3182+
    * targetlist. For any output that is a plain Var, get the width estimate
    3183+
    * that was made while planning the subquery. Otherwise, fall back on a
    3184+
    * datatype-based estimate.
    3185+
    */
    3186+
    foreach(lc, subroot->parse->targetList)
    3187+
    {
    3188+
    TargetEntry *te = (TargetEntry *) lfirst(lc);
    3189+
    Node *texpr = (Node *) te->expr;
    3190+
    int32 item_width;
    3191+
    3192+
    Assert(IsA(te, TargetEntry));
    3193+
    /* junk columns aren't visible to upper query */
    3194+
    if (te->resjunk)
    3195+
    continue;
    3196+
    3197+
    /*
    3198+
    * XXX This currently doesn't work for subqueries containing set
    3199+
    * operations, because the Vars in their tlists are bogus references
    3200+
    * to the first leaf subquery, which wouldn't give the right answer
    3201+
    * even if we could still get to its PlannerInfo. So fall back on
    3202+
    * datatype in that case.
    3203+
    */
    3204+
    if (IsA(texpr, Var) &&
    3205+
    subroot->parse->setOperations == NULL)
    3206+
    {
    3207+
    Var *var = (Var *) texpr;
    3208+
    RelOptInfo *subrel = find_base_rel(subroot, var->varno);
    3209+
    3210+
    item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
    3211+
    }
    3212+
    else
    3213+
    {
    3214+
    item_width = get_typavgwidth(exprType(texpr), exprTypmod(texpr));
    3215+
    }
    3216+
    Assert(item_width > 0);
    3217+
    Assert(te->resno >= rel->min_attr && te->resno <= rel->max_attr);
    3218+
    rel->attr_widths[te->resno - rel->min_attr] = item_width;
    3219+
    }
    3220+
    3221+
    /* Now estimate number of output rows, etc */
    3222+
    set_baserel_size_estimates(root, rel);
    3223+
    }
    3224+
    31543225
    /*
    31553226
    * set_function_size_estimates
    31563227
    * Set the size estimates for a base relation that is a function call.
    @@ -3251,11 +3322,17 @@ set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, Plan *cteplan)
    32513322
    * set_rel_width
    32523323
    * Set the estimated output width of a base relation.
    32533324
    *
    3325+
    * The estimated output width is the sum of the per-attribute width estimates
    3326+
    * for the actually-referenced columns, plus any PHVs or other expressions
    3327+
    * that have to be calculated at this relation. This is the amount of data
    3328+
    * we'd need to pass upwards in case of a sort, hash, etc.
    3329+
    *
    32543330
    * NB: this works best on plain relations because it prefers to look at
    3255-
    * real Vars. It will fail to make use of pg_statistic info when applied
    3256-
    * to a subquery relation, even if the subquery outputs are simple vars
    3257-
    * that we could have gotten info for. Is it worth trying to be smarter
    3258-
    * about subqueries?
    3331+
    * real Vars. For subqueries, set_subquery_size_estimates will already have
    3332+
    * copied up whatever per-column estimates were made within the subquery,
    3333+
    * and for other types of rels there isn't much we can do anyway. We fall
    3334+
    * back on (fairly stupid) datatype-based width estimates if we can't get
    3335+
    * any better number.
    32593336
    *
    32603337
    * The per-attribute width estimates are cached for possible re-use while
    32613338
    * building join relations.
    @@ -3265,6 +3342,7 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
    32653342
    {
    32663343
    Oid reloid = planner_rt_fetch(rel->relid, root)->relid;
    32673344
    int32 tuple_width = 0;
    3345+
    bool have_wholerow_var = false;
    32683346
    ListCell *lc;
    32693347

    32703348
    foreach(lc, rel->reltargetlist)
    @@ -3284,8 +3362,18 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
    32843362
    ndx = var->varattno - rel->min_attr;
    32853363

    32863364
    /*
    3287-
    * The width probably hasn't been cached yet, but may as well
    3288-
    * check
    3365+
    * If it's a whole-row Var, we'll deal with it below after we
    3366+
    * have already cached as many attr widths as possible.
    3367+
    */
    3368+
    if (var->varattno == 0)
    3369+
    {
    3370+
    have_wholerow_var = true;
    3371+
    continue;
    3372+
    }
    3373+
    3374+
    /*
    3375+
    * The width may have been cached already (especially if it's
    3376+
    * a subquery), so don't duplicate effort.
    32893377
    */
    32903378
    if (rel->attr_widths[ndx] > 0)
    32913379
    {
    @@ -3294,7 +3382,7 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
    32943382
    }
    32953383

    32963384
    /* Try to get column width from statistics */
    3297-
    if (reloid != InvalidOid)
    3385+
    if (reloid != InvalidOid && var->varattno > 0)
    32983386
    {
    32993387
    item_width = get_attavgwidth(reloid, var->varattno);
    33003388
    if (item_width > 0)
    @@ -3335,6 +3423,39 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
    33353423
    tuple_width += item_width;
    33363424
    }
    33373425
    }
    3426+
    3427+
    /*
    3428+
    * If we have a whole-row reference, estimate its width as the sum of
    3429+
    * per-column widths plus sizeof(HeapTupleHeaderData).
    3430+
    */
    3431+
    if (have_wholerow_var)
    3432+
    {
    3433+
    int32 wholerow_width = sizeof(HeapTupleHeaderData);
    3434+
    3435+
    if (reloid != InvalidOid)
    3436+
    {
    3437+
    /* Real relation, so estimate true tuple width */
    3438+
    wholerow_width += get_relation_data_width(reloid,
    3439+
    rel->attr_widths - rel->min_attr);
    3440+
    }
    3441+
    else
    3442+
    {
    3443+
    /* Do what we can with info for a phony rel */
    3444+
    AttrNumber i;
    3445+
    3446+
    for (i = 1; i <= rel->max_attr; i++)
    3447+
    wholerow_width += rel->attr_widths[i - rel->min_attr];
    3448+
    }
    3449+
    3450+
    rel->attr_widths[0 - rel->min_attr] = wholerow_width;
    3451+
    3452+
    /*
    3453+
    * Include the whole-row Var as part of the output tuple. Yes,
    3454+
    * that really is what happens at runtime.
    3455+
    */
    3456+
    tuple_width += wholerow_width;
    3457+
    }
    3458+
    33383459
    Assert(tuple_width >= 0);
    33393460
    rel->width = tuple_width;
    33403461
    }

    src/backend/optimizer/plan/planner.c

    Lines changed: 1 addition & 1 deletion
    Original file line numberDiff line numberDiff line change
    @@ -3102,7 +3102,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid)
    31023102
    * set_baserel_size_estimates, just do a quick hack for rows and width.
    31033103
    */
    31043104
    rel->rows = rel->tuples;
    3105-
    rel->width = get_relation_data_width(tableOid);
    3105+
    rel->width = get_relation_data_width(tableOid, NULL);
    31063106

    31073107
    root->total_table_pages = rel->pages;
    31083108

    src/backend/optimizer/util/plancat.c

    Lines changed: 16 additions & 6 deletions
    Original file line numberDiff line numberDiff line change
    @@ -322,7 +322,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
    322322
    * estimate_rel_size - estimate # pages and # tuples in a table or index
    323323
    *
    324324
    * If attr_widths isn't NULL, it points to the zero-index entry of the
    325-
    * relation's attr_width[] cache; we fill this in if we have need to compute
    325+
    * relation's attr_widths[] cache; we fill this in if we have need to compute
    326326
    * the attribute widths for estimation purposes.
    327327
    */
    328328
    void
    @@ -435,8 +435,9 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
    435435
    * get_rel_data_width
    436436
    *
    437437
    * Estimate the average width of (the data part of) the relation's tuples.
    438-
    * If attr_widths isn't NULL, also store per-column width estimates into
    439-
    * that array.
    438+
    *
    439+
    * If attr_widths isn't NULL, it points to the zero-index entry of the
    440+
    * relation's attr_widths[] cache; use and update that cache as appropriate.
    440441
    *
    441442
    * Currently we ignore dropped columns. Ideally those should be included
    442443
    * in the result, but we haven't got any way to get info about them; and
    @@ -456,6 +457,14 @@ get_rel_data_width(Relation rel, int32 *attr_widths)
    456457

    457458
    if (att->attisdropped)
    458459
    continue;
    460+
    461+
    /* use previously cached data, if any */
    462+
    if (attr_widths != NULL && attr_widths[i] > 0)
    463+
    {
    464+
    tuple_width += attr_widths[i];
    465+
    continue;
    466+
    }
    467+
    459468
    /* This should match set_rel_width() in costsize.c */
    460469
    item_width = get_attavgwidth(RelationGetRelid(rel), i);
    461470
    if (item_width <= 0)
    @@ -474,18 +483,19 @@ get_rel_data_width(Relation rel, int32 *attr_widths)
    474483
    /*
    475484
    * get_relation_data_width
    476485
    *
    477-
    * External API for get_rel_data_width
    486+
    * External API for get_rel_data_width: same behavior except we have to
    487+
    * open the relcache entry.
    478488
    */
    479489
    int32
    480-
    get_relation_data_width(Oid relid)
    490+
    get_relation_data_width(Oid relid, int32 *attr_widths)
    481491
    {
    482492
    int32 result;
    483493
    Relation relation;
    484494

    485495
    /* As above, assume relation is already locked */
    486496
    relation = heap_open(relid, NoLock);
    487497

    488-
    result = get_rel_data_width(relation, NULL);
    498+
    result = get_rel_data_width(relation, attr_widths);
    489499

    490500
    heap_close(relation, NoLock);
    491501

    src/include/optimizer/cost.h

    Lines changed: 2 additions & 0 deletions
    Original file line numberDiff line numberDiff line change
    @@ -121,6 +121,8 @@ extern void set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
    121121
    RelOptInfo *inner_rel,
    122122
    SpecialJoinInfo *sjinfo,
    123123
    List *restrictlist);
    124+
    extern void set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel,
    125+
    PlannerInfo *subroot);
    124126
    extern void set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel);
    125127
    extern void set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel);
    126128
    extern void set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel,

    src/include/optimizer/plancat.h

    Lines changed: 1 addition & 1 deletion
    Original file line numberDiff line numberDiff line change
    @@ -31,7 +31,7 @@ extern void get_relation_info(PlannerInfo *root, Oid relationObjectId,
    3131
    extern void estimate_rel_size(Relation rel, int32 *attr_widths,
    3232
    BlockNumber *pages, double *tuples);
    3333

    34-
    extern int32 get_relation_data_width(Oid relid);
    34+
    extern int32 get_relation_data_width(Oid relid, int32 *attr_widths);
    3535

    3636
    extern bool relation_excluded_by_constraints(PlannerInfo *root,
    3737
    RelOptInfo *rel, RangeTblEntry *rte);

    0 commit comments

    Comments
     (0)
    0