8000 Teach tuplesort.c about "top N" sorting, in which only the first N tu… · postgrespro/postgres@d26559d · GitHub
[go: up one dir, main page]

Skip to content

Commit d26559d

Browse files
committed
Teach tuplesort.c about "top N" sorting, in which only the first N tuples
need be returned. We keep a heap of the current best N tuples and sift-up new tuples into it as we scan the input. For M input tuples this means only about M*log(N) comparisons instead of M*log(M), not to mention a lot less workspace when N is small --- avoiding spill-to-disk for large M is actually the most attractive thing about it. Patch includes planner and executor support for invoking this facility in ORDER BY ... LIMIT queries. Greg Stark, with some editorialization by moi.
1 parent 0fef38d commit d26559d

File tree

13 files changed

+464
-58
lines changed
  • util
  • utils
  • include
  • 13 files changed

    +464
    -58
    lines changed

    src/backend/executor/nodeLimit.c

    Lines changed: 32 additions & 1 deletion
    Original file line numberDiff line numberDiff line change
    @@ -8,7 +8,7 @@
    88
    *
    99
    *
    1010
    * IDENTIFICATION
    11-
    * $PostgreSQL: pgsql/src/backend/executor/nodeLimit.c,v 1.29 2007/01/05 22:19:28 momjian Exp $
    11+
    * $PostgreSQL: pgsql/src/backend/executor/nodeLimit.c,v 1.30 2007/05/04 01:13:43 tgl Exp $
    1212
    *
    1313
    *-------------------------------------------------------------------------
    1414
    */
    @@ -280,6 +280,37 @@ recompute_limits(LimitState *node)
    280280
    /* Reset position to start-of-scan */
    281281
    node->position = 0;
    282282
    node->subSlot = NULL;
    283+
    284+
    /*
    285+
    * If we have a COUNT, and our input is a Sort node, notify it that it can
    286+
    * use bounded sort.
    287+
    *
    288+
    * This is a bit of a kluge, but we don't have any more-abstract way of
    289+
    * communicating between the two nodes; and it doesn't seem worth trying
    290+
    * to invent one without some more examples of special communication needs.
    291+
    *
    292+
    * Note: it is the responsibility of nodeSort.c to react properly to
    293+
    * changes of these parameters. If we ever do redesign this, it'd be
    294+
    * a good idea to integrate this signaling with the parameter-change
    295+
    * mechanism.
    296+
    */
    297+
    if (IsA(outerPlanState(node), SortState))
    298+
    {
    299+
    SortState *sortState = (SortState *) outerPlanState(node);
    300+
    int64 tuples_needed = node->count + node->offset;
    301+
    302+
    /* negative test checks for overflow */
    303+
    if (node->noCount || tuples_needed < 0)
    304+
    {
    305+
    /* make sure flag gets reset if needed upon rescan */
    306+
    sortState->bounded = false;
    307+
    }
    308+
    else
    309+
    {
    310+
    sortState->bounded = true;
    311+
    sortState->bound = tuples_needed;
    312+
    }
    313+
    }
    283314
    }
    284315

    285316
    /* ----------------------------------------------------------------

    src/backend/executor/nodeSort.c

    Lines changed: 10 additions & 2 deletions
    Original file line numberDiff line numberDiff line change
    @@ -8,7 +8,7 @@
    88
    *
    99
    *
    1010
    * IDENTIFICATION
    11-
    * $PostgreSQL: pgsql/src/backend/executor/nodeSort.c,v 1.60 2007/01/09 02:14:11 tgl Exp $
    11+
    * $PostgreSQL: pgsql/src/backend/executor/nodeSort.c,v 1.61 2007/05/04 01:13:44 tgl Exp $
    1212
    *
    1313
    *-------------------------------------------------------------------------
    1414
    */
    @@ -89,6 +89,8 @@ ExecSort(SortState *node)
    8989
    plannode->nullsFirst,
    9090
    work_mem,
    9191
    node->randomAccess);
    92+
    if (node->bounded)
    93+
    tuplesort_set_bound(tuplesortstate, node->bound);
    9294
    node->tuplesortstate = (void *) tuplesortstate;
    9395

    9496
    /*
    @@ -119,6 +121,8 @@ ExecSort(SortState *node)
    119121
    * finally set the sorted flag to true
    120122
    */
    121123
    node->sort_Done = true;
    124+
    node->bounded_Done = node->bounded;
    125+
    node->bound_Done = node->bound;
    122126
    SO1_printf("ExecSort: %s\n", "sorting done");
    123127
    }
    124128

    @@ -167,6 +171,7 @@ ExecInitSort(Sort *node, EState *estate, int eflags)
    167171
    EXEC_FLAG_BACKWARD |
    168172
    EXEC_FLAG_MARK)) != 0;
    169173

    174+
    sortstate->bounded = false;
    170175
    sortstate->sort_Done = false;
    171176
    sortstate->tuplesortstate = NULL;
    172177

    @@ -307,11 +312,14 @@ ExecReScanSort(SortState *node, ExprContext *exprCtxt)
    307312

    308313
    /*
    309314
    * If subnode is to be rescanned then we forget previous sort results; we
    310-
    * have to re-read the subplan and re-sort.
    315+
    * have to re-read the subplan and re-sort. Also must re-sort if the
    316+
    * bounded-sort parameters changed or we didn't select randomAccess.
    311317
    *
    312318
    * Otherwise we can just rewind and rescan the sorted output.
    313319
    */
    314320
    if (((PlanState *) node)->lefttree->chgParam != NULL ||
    321+
    node->bounded != node->bounded_Done ||
    322+
    node->bound != node->bound_Done ||
    315323
    !node->randomAccess)
    316324
    {
    317325
    node->sort_Done = false;

    src/backend/optimizer/path/costsize.c

    Lines changed: 60 additions & 17 deletions
    Original file line numberDiff line numberDiff line change
    @@ -54,7 +54,7 @@
    5454
    * Portions Copyright (c) 1994, Regents of the University of California
    5555
    *
    5656
    * IDENTIFICATION
    57-
    * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.181 2007/04/21 21:01:44 tgl Exp $
    57+
    * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.182 2007/05/04 01:13:44 tgl Exp $
    5858
    *
    5959
    *-------------------------------------------------------------------------
    6060
    */
    @@ -922,6 +922,10 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
    922922
    * disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem)))
    923923
    * cpu = comparison_cost * t * log2(t)
    924924
    *
    925+
    * If the sort is bounded (i.e., only the first k result tuples are needed)
    926+
    * and k tuples can fit into work_mem, we use a heap method that keeps only
    927+
    * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
    928+
    *
    925929
    * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
    926930
    * accesses (XXX can't we refine that guess?)
    927931
    *
    @@ -932,6 +936,7 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
    932936
    * 'input_cost' is the total cost for reading the input data
    933937
    * 'tuples' is the number of tuples in the relation
    934938
    * 'width' is the average tuple width in bytes
    939+
    * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
    935940
    *
    936941
    * NOTE: some callers currently pass NIL for pathkeys because they
    937942
    * can't conveniently supply the sort keys. Since this routine doesn't
    @@ -942,11 +947,14 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
    942947
    */
    943948
    void
    944949
    cost_sort(Path *path, PlannerInfo *root,
    945-
    List *pathkeys, Cost input_cost, double tuples, int width)
    950+
    List *pathkeys, Cost input_cost, double tuples, int width,
    951+
    double limit_tuples)
    946952
    {
    947953
    Cost startup_cost = input_cost;
    94 EED3 8954
    Cost run_cost = 0;
    949-
    double nbytes = relation_byte_size(tuples, width);
    955+
    double input_bytes = relation_byte_size(tuples, width);
    956+
    double output_bytes;
    957+
    double output_tuples;
    950958
    long work_mem_bytes = work_mem * 1024L;
    951959

    952960
    if (!enable_sort)
    @@ -959,23 +967,39 @@ cost_sort(Path *path, PlannerInfo *root,
    959967
    if (tuples < 2.0)
    960968
    tuples = 2.0;
    961969

    962-
    /*
    963-
    * CPU costs
    964-
    *
    965-
    * Assume about two operator evals per tuple comparison and N log2 N
    966-
    * comparisons
    967-
    */
    968-
    startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
    970+
    /* Do we have a useful LIMIT? */
    971+
    if (limit_tuples > 0 && limit_tuples < tuples)
    972+
    {
    973+
    output_tuples = limit_tuples;
    974+
    output_bytes = relation_byte_size(output_tuples, width);
    975+
    }
    976+
    else
    977+
    {
    978+
    output_tuples = tuples;
    979+
    output_bytes = input_bytes;
    980+
    }
    969981

    970-
    /* disk costs */
    971-
    if (nbytes > work_mem_bytes)
    982+
    if (output_bytes > work_mem_bytes)
    972983
    {
    973-
    double npages = ceil(nbytes / BLCKSZ);
    974-
    double nruns = (nbytes / work_mem_bytes) * 0.5;
    984+
    /*
    985+
    * We'll have to use a disk-based sort of all the tuples
    986+
    */
    987+
    double npages = ceil(input_bytes / BLCKSZ);
    988+
    double nruns = (input_bytes / work_mem_bytes) * 0.5;
    975989
    double mergeorder = tuplesort_merge_order(work_mem_bytes);
    976990
    double log_runs;
    977991
    double npageaccesses;
    978992

    993+
    /*
    994+
    * CPU costs
    995+
    *
    996+
    * Assume about two operator evals per tuple comparison and N log2 N
    997+
    * comparisons
    998+
    */
    999+
    startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
    1000+
    1001+
    /* Disk costs */
    1002+
    9791003
    /* Compute logM(r) as log(r) / log(M) */
    9801004
    if (nruns > mergeorder)
    9811005
    log_runs = ceil(log(nruns) / log(mergeorder));
    @@ -986,10 +1010,27 @@ cost_sort(Path *path, PlannerInfo *root,
    9861010
    startup_cost += npageaccesses *
    9871011
    (seq_page_cost * 0.75 + random_page_cost * 0.25);
    9881012
    }
    1013+
    else if (tuples > 2 * output_tuples || input_bytes > work_mem_bytes)
    1014+
    {
    1015+
    /*
    1016+
    * We'll use a bounded heap-sort keeping just K tuples in memory,
    1017+
    * for a total number of tuple comparisons of N log2 K; but the
    1018+
    * constant factor is a bit higher than for quicksort. Tweak it
    1019+
    * so that the cost curve is continuous at the crossover point.
    1020+
    */
    1021+
    startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(2.0 * output_tuples);
    1022+
    }
    1023+
    else
    1024+
    {
    1025+
    /* We'll use plain quicksort on all the input tuples */
    1026+
    startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
    1027+
    }
    9891028

    9901029
    /*
    9911030
    * Also charge a small amount (arbitrarily set equal to operator cost) per
    992-
    * extracted tuple.
    1031+
    * extracted tuple. Note it's correct to use tuples not output_tuples
    1032+
    * here --- the upper LIMIT will pro-rate the run cost so we'd be double
    1033+
    * counting the LIMIT otherwise.
    9931034
    */
    9941035
    run_cost += cpu_operator_cost * tuples;
    9951036

    @@ -1431,7 +1472,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
    14311472
    outersortkeys,
    14321473
    outer_path->total_cost,
    14331474
    outer_path_rows,
    1434-
    outer_path->parent->width);
    1475+
    outer_path->parent->width,
    1476+
    -1.0);
    14351477
    startup_cost += sort_path.startup_cost;
    14361478
    run_cost += (sort_path.total_cost - sort_path.startup_cost)
    14371479
    * outerscansel;
    @@ -1450,7 +1492,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
    14501492
    innersortkeys,
    14511493
    inner_path->total_cost,
    14521494
    inner_path_rows,
    1453-
    inner_path->parent->width);
    1495+
    inner_path->parent->width,
    1496+
    -1.0);
    14541497
    startup_cost += sort_path.startup_cost;
    14551498
    run_cost += (sort_path.total_cost - sort_path.startup_cost)
    14561499
    * innerscansel * rescanratio;

    src/backend/optimizer/plan/createplan.c

    Lines changed: 20 additions & 11 deletions
    Original file line numberDiff line numberDiff line change
    @@ -10,7 +10,7 @@
    1010
    *
    1111
    *
    1212
    * IDENTIFICATION
    13-
    * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.229 2007/04/21 21:01:44 tgl Exp $
    13+
    * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.230 2007/05/04 01:13:44 tgl Exp $
    1414
    *
    1515
    *-------------------------------------------------------------------------
    1616
    */
    @@ -122,7 +122,8 @@ static MergeJoin *make_mergejoin(List *tlist,
    122122
    Plan *lefttree, Plan *righttree,
    123123
    JoinType jointype);
    124124
    static Sort *make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
    125-
    AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst);
    125+
    AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst,
    126+
    double limit_tuples);
    126127

    127128

    128129
    /*
    @@ -1579,7 +1580,8 @@ create_mergejoin_plan(PlannerInfo *root,
    15791580
    outer_plan = (Plan *)
    15801581
    make_sort_from_pathkeys(root,
    15811582
    outer_plan,
    1582-
    best_path->outersortkeys);
    1583+
    best_path->outersortkeys,
    1584+
    -1.0);
    15831585
    outerpathkeys = best_path->outersortkeys;
    15841586
    }
    15851587
    else
    @@ -1591,7 +1593,8 @@ create_mergejoin_plan(PlannerInfo *root,
    15911593
    inner_plan = (Plan *)
    15921594
    make_sort_from_pathkeys(root,
    15931595
    inner_plan,
    1594-
    best_path->innersortkeys);
    1596+
    best_path->innersortkeys,
    1597+
    -1.0);
    15951598
    innerpathkeys = best_path->innersortkeys;
    15961599
    }
    15971600
    else
    @@ -2589,11 +2592,13 @@ make_mergejoin(List *tlist,
    25892592
    * make_sort --- basic routine to build a Sort plan node
    25902593
    *
    25912594
    * Caller must have built the sortColIdx, sortOperators, and nullsFirst
    2592-
    * arrays already.
    2595+
    * arrays already. limit_tuples is as for cost_sort (in particular, pass
    2596+
    * -1 if no limit)
    25932597
    */
    25942598
    static Sort *
    25952599
    make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
    2596-
    AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst)
    2600+
    AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst,
    2601+
    double limit_tuples)
    25972602
    {
    25982603
    Sort *node = makeNode(Sort);
    25992604
    Plan *plan = &node->plan;
    @@ -2603,7 +2608,8 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
    26032608
    cost_sort(&sort_path, root, NIL,
    26042609
    lefttree->total_cost,
    26052610
    lefttree->plan_rows,
    2606-
    lefttree->plan_width);
    2611+
    lefttree->plan_width,
    2612+
    limit_tuples);
    26072613
    plan->startup_cost = sort_path.startup_cost;
    26082614
    plan->total_cost = sort_path.total_cost;
    26092615
    plan->targetlist = lefttree->targetlist;
    @@ -2664,6 +2670,8 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first,
    26642670
    *
    26652671
    * 'lefttree' is the node which yields input tuples
    26662672
    * 'pathkeys' is the list of pathkeys by which the result is to be sorted
    2673+
    * 'limit_tuples' is the bound on the number of output tuples;
    2674+
    * -1 if no bound
    26672675
    *
    26682676
    * We must convert the pathkey information into arrays of sort key column
    26692677
    * numbers and sort operator OIDs.
    @@ -2675,7 +2683,8 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first,
    26752683
    * adding a Result node just to do the projection.
    26762684
    */
    26772685
    Sort *
    2678-
    make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys)
    2686+
    make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys,
    2687+
    double limit_tuples)
    26792688
    {
    26802689
    List *tlist = lefttree->targetlist;
    26812690
    ListCell *i;
    @@ -2810,7 +2819,7 @@ make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys)
    28102819
    Assert(numsortkeys > 0);
    28112820

    28122821
    return make_sort(root, lefttree, numsortkeys,
    2813-
    sortColIdx, sortOperators, nullsFirst);
    2822+
    sortColIdx, sortOperators, nullsFirst, limit_tuples);
    28142823
    }
    28152824

    28162825
    /*
    @@ -2859,7 +2868,7 @@ make_sort_from_sortclauses(PlannerInfo *root, List *sortcls, Plan *lefttree)
    28592868
    Assert(numsortkeys > 0);
    28602869

    28612870
    return make_sort(root, lefttree, numsortkeys,
    2862-
    sortColIdx, sortOperators, nullsFirst);
    2871+
    sortColIdx, sortOperators, nullsFirst, -1.0);
    28632872
    }
    28642873

    28652874
    /*
    @@ -2919,7 +2928,7 @@ make_sort_from_groupcols(PlannerInfo *root,
    29192928
    Assert(numsortkeys > 0);
    29202929

    29212930
    return make_sort(root, lefttree, numsortkeys,
    2922-
    sortColIdx, sortOperators, nullsFirst);
    2931+
    sortColIdx, sortOperators, nullsFirst, -1.0);
    29232932
    }
    29242933
    29252934
    Material *

    src/backend/optimizer/plan/planmain.c

    Lines changed: 10 additions & 3 deletions
    Original file line numberDiff line numberDiff line change
    @@ -14,7 +14,7 @@
    1414
    *
    1515
    *
    1616
    * IDENTIFICATION
    17-
    * $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.100 2007/04/21 21:01:45 tgl Exp $
    17+
    * $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.101 2007/05/04 01:13:44 tgl Exp $
    1818
    *
    1919
    *-------------------------------------------------------------------------
    2020
    */
    @@ -47,6 +47,8 @@
    4747
    * tlist is the target list the query should produce
    4848
    * (this is NOT necessarily root->parse->targetList!)
    4949
    * tuple_fraction is the fraction of tuples we expect will be retrieved
    50+
    * limit_tuples is a hard limit on number of tuples to retrieve,
    51+
    * or -1 if no limit
    5052
    *
    5153
    * Output parameters:
    5254
    * *cheapest_path receives the overall-cheapest path for the query
    @@ -74,9 +76,13 @@
    7476
    * from the plan to be retrieved
    7577
    * tuple_fraction >= 1: tuple_fraction is the absolute number of tuples
    7678
    * expected to be retrieved (ie, a LIMIT specification)
    79+
    * Note that a nonzero tuple_fraction could come from outer context; it is
    80+
    * therefore not redundant with limit_tuples. We use limit_tuples to determine
    81+
    * whether a bounded sort can be used at runtime.
    7782
    */
    7883
    void
    79-
    query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
    84+
    query_planner(PlannerInfo *root, List *tlist,
    85+
    double tuple_fraction, double limit_tuples,
    8086
    Path **cheapest_path, Path **sorted_path,
    8187
    double *num_groups)
    8288
    {
    @@ -354,7 +360,8 @@ query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
    354360
    /* Figure cost for sorting */
    355361
    cost_sort(&sort_path, root, root->query_pathkeys,
    356362
    cheapestpath->total_cost,
    357-
    final_rel->rows, final_rel->width);
    363+
    final_rel->rows, final_rel->width,
    364+
    limit_tuples);
    358365
    }
    359366

    360367
    if (compare_fractional_path_costs(sortedpath, &sort_path,

    0 commit comments

    Comments
     (0)
    0