Increase number of hash join buckets for underestimate. · postgrespro/postgres@30d7ae3 · GitHub

Commit 30d7ae3

Increase number of hash join buckets for underestimate.
If we expect batching at the very beginning, we size nbuckets for "full work_mem" (see how many tuples we can get into work_mem, while not breaking NTUP_PER_BUCKET threshold).

If we expect to be fine without batching, we start with the 'right' nbuckets and track the optimal nbuckets as we go (without actually resizing the hash table). Once we hit work_mem (considering the optimal nbuckets value), we keep the value.

At the end of the first batch, we check whether (nbuckets != nbuckets_optimal) and resize the hash table if needed. Also, we keep this value for all batches (it's OK because it assumes full work_mem, and it makes the batchno evaluation trivial). So the resize happens only once.

There could be cases where it would improve performance to allow the NTUP_PER_BUCKET threshold to be exceeded to keep everything in one batch rather than spilling to a second batch, but attempts to generate such a case have so far been unsuccessful; that issue may be addressed with a follow-on patch after further investigation.

Tomas Vondra with minor format and comment cleanup by me

Reviewed by Robert Haas, Heikki Linnakangas, and Kevin Grittner
1 parent 494affb commit 30d7ae3
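At its core the patch is a doubling rule plus one deferred resize: while the build side is still in a single batch, each insert checks whether the tuple count has crossed the NTUP_PER_BUCKET threshold for the current nbuckets_optimal and doubles it if so; the bucket array itself is reallocated only once, after the first scan. A minimal standalone sketch of the rule (hypothetical helper name; the real logic lives in ExecHashTableInsert and MultiExecHash in the diff below, and this assumes NTUP_PER_BUCKET is 1, its value in nodeHash.c of that era):

    #include <limits.h>

    #define NTUP_PER_BUCKET 1       /* assumed target load factor; see nodeHash.c */

    /*
     * Hypothetical distillation of the growth rule: called once per inserted
     * tuple while nbatch == 1; never shrinks, never overflows int.
     */
    static void
    track_optimal_buckets(double ntuples, int nbatch,
                          int *nbuckets_optimal, int *log2_nbuckets_optimal)
    {
        if (nbatch == 1 &&
            *nbuckets_optimal <= INT_MAX / 2 &&     /* overflow protection */
            ntuples >= (double) *nbuckets_optimal * NTUP_PER_BUCKET)
        {
            *nbuckets_optimal *= 2;
            *log2_nbuckets_optimal += 1;
        }
    }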

File tree

3 files changed (+141 / -6 lines):

  • src/backend/commands/explain.c
  • src/backend/executor/nodeHash.c
  • src/include/executor/hashjoin.h

src/backend/commands/explain.c

Lines changed: 7 additions & 4 deletions

@@ -1901,18 +1901,21 @@ show_hash_info(HashState *hashstate, ExplainState *es)
 	if (es->format != EXPLAIN_FORMAT_TEXT)
 	{
 		ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es);
+		ExplainPropertyLong("Original Hash Buckets",
+							hashtable->nbuckets_original, es);
 		ExplainPropertyLong("Hash Batches", hashtable->nbatch, es);
 		ExplainPropertyLong("Original Hash Batches",
 							hashtable->nbatch_original, es);
 		ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
 	}
-	else if (hashtable->nbatch_original != hashtable->nbatch)
+	else if ((hashtable->nbatch_original != hashtable->nbatch) ||
+			 (hashtable->nbuckets_original != hashtable->nbuckets))
 	{
 		appendStringInfoSpaces(es->str, es->indent * 2);
 		appendStringInfo(es->str,
-						 "Buckets: %d  Batches: %d (originally %d)  Memory Usage: %ldkB\n",
-						 hashtable->nbuckets, hashtable->nbatch,
-						 hashtable->nbatch_original, spacePeakKb);
+						 "Buckets: %d (originally %d)  Batches: %d (originally %d)  Memory Usage: %ldkB\n",
+						 hashtable->nbuckets, hashtable->nbuckets_original,
+						 hashtable->nbatch, hashtable->nbatch_original, spacePeakKb);
 	}
 	else
 	{
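With this change, the text-format line reports the original bucket count whenever the table was resized. A hypothetical EXPLAIN ANALYZE fragment (illustrative numbers, not taken from the commit), following the new format string:

    Buckets: 2048 (originally 1024)  Batches: 1 (originally 1)  Memory Usage: 560kB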

src/backend/executor/nodeHash.c

Lines changed: 129 additions & 2 deletions

@@ -39,6 +39,7 @@


 static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
+static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
 static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
 					  int mcvsToUse);
 static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -117,6 +118,7 @@ MultiExecHash(HashState *node)
 				/* It's a skew tuple, so put it into that hash table */
 				ExecHashSkewTableInsert(hashtable, slot, hashvalue,
 										bucketNumber);
+				hashtable->skewTuples += 1;
 			}
 			else
 			{
@@ -127,6 +129,25 @@ MultiExecHash(HashState *node)
 		}
 	}

+	/* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
+	if (hashtable->nbuckets != hashtable->nbuckets_optimal)
+	{
+		/* We never decrease the number of buckets. */
+		Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
+
+#ifdef HJDEBUG
+		printf("Increasing nbuckets %d => %d\n",
+			   hashtable->nbuckets, hashtable->nbuckets_optimal);
+#endif
+
+		ExecHashIncreaseNumBuckets(hashtable);
+	}
+
+	/* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
+	hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple);
+	if (hashtable->spaceUsed > hashtable->spacePeak)
+		hashtable->spacePeak = hashtable->spaceUsed;
+
 	/* must provide our own instrumentation support */
 	if (node->ps.instrument)
 		InstrStopNode(node->ps.instrument, hashtable->totalTuples);
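A note on scale for the new spaceUsed term: the bucket array costs one pointer per bucket, so a large single-batch table now charges a visible amount against work_mem. A standalone back-of-the-envelope sketch (assumes a 64-bit build, where each bucket head is an 8-byte HashJoinTuple pointer, and NTUP_PER_BUCKET = 1):

    #include <stdio.h>

    int
    main(void)
    {
        /* ~1M inner tuples with NTUP_PER_BUCKET = 1 drive nbuckets to 2^20 */
        long    nbuckets = 1L << 20;
        long    overhead = nbuckets * (long) sizeof(void *);    /* bucket array only */

        /* prints 8192 kB on a 64-bit build; this is what spaceUsed now includes */
        printf("bucket array overhead: %ld kB\n", overhead / 1024);
        return 0;
    }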
@@ -272,7 +293,10 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	 */
 	hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
 	hashtable->nbuckets = nbuckets;
+	hashtable->nbuckets_original = nbuckets;
+	hashtable->nbuckets_optimal = nbuckets;
 	hashtable->log2_nbuckets = log2_nbuckets;
+	hashtable->log2_nbuckets_optimal = log2_nbuckets;
 	hashtable->buckets = NULL;
 	hashtable->keepNulls = keepNulls;
 	hashtable->skewEnabled = false;
@@ -286,6 +310,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	hashtable->nbatch_outstart = nbatch;
 	hashtable->growEnabled = true;
 	hashtable->totalTuples = 0;
+	hashtable->skewTuples = 0;
 	hashtable->innerBatchFile = NULL;
 	hashtable->outerBatchFile = NULL;
 	hashtable->spaceUsed = 0;
@@ -620,6 +645,19 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	 */
 	ninmemory = nfreed = 0;

+	/* If we know we need to resize nbuckets, we can do it while rebatching. */
+	if (hashtable->nbuckets_optimal != hashtable->nbuckets)
+	{
+		/* we never decrease the number of buckets */
+		Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
+
+		hashtable->nbuckets = hashtable->nbuckets_optimal;
+		hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
+
+		hashtable->buckets = repalloc(hashtable->buckets,
+								sizeof(HashJoinTuple) * hashtable->nbuckets);
+	}
+
 	/*
 	 * We will scan through the chunks directly, so that we can reset the
 	 * buckets now and not have to keep track which tuples in the buckets have
@@ -703,6 +741,78 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	}
 }

+/*
+ * ExecHashIncreaseNumBuckets
+ *		increase the original number of buckets in order to reduce
+ *		number of tuples per bucket
+ */
+static void
+ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+{
+	HashMemoryChunk chunk;
+
+	/* do nothing if not an increase (it's called increase for a reason) */
+	if (hashtable->nbuckets >= hashtable->nbuckets_optimal)
+		return;
+
+	/*
+	 * We already know the optimal number of buckets, so let's just
+	 * compute the log2_nbuckets for it.
+	 */
+	hashtable->nbuckets = hashtable->nbuckets_optimal;
+	hashtable->log2_nbuckets = my_log2(hashtable->nbuckets_optimal);
+
+	Assert(hashtable->nbuckets > 1);
+	Assert(hashtable->nbuckets <= (INT_MAX / 2));
+	Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
+
+#ifdef HJDEBUG
+	printf("Increasing nbuckets to %d\n", hashtable->nbuckets);
+#endif
+
+	/*
+	 * Just reallocate the proper number of buckets - we don't need to
+	 * walk through them - we can walk the dense-allocated chunks
+	 * (just like in ExecHashIncreaseNumBatches, but without all the
+	 * copying into new chunks)
+	 */
+	hashtable->buckets =
+		(HashJoinTuple *) repalloc(hashtable->buckets,
+								   hashtable->nbuckets * sizeof(HashJoinTuple));
+
+	memset(hashtable->buckets, 0, sizeof(void *) * hashtable->nbuckets);
+
+	/* scan through all tuples in all chunks to rebuild the hash table */
+	for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next)
+	{
+		/* process all tuples stored in this chunk */
+		size_t		idx = 0;
+
+		while (idx < chunk->used)
+		{
+			HashJoinTuple hashTuple = (HashJoinTuple) (chunk->data + idx);
+			int			bucketno;
+			int			batchno;
+
+			ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
+									  &bucketno, &batchno);
+
+			/* add the tuple to the proper bucket */
+			hashTuple->next = hashtable->buckets[bucketno];
+			hashtable->buckets[bucketno] = hashTuple;
+
+			/* advance index past the tuple */
+			idx += MAXALIGN(HJTUPLE_OVERHEAD +
+							HJTUPLE_MINTUPLE(hashTuple)->t_len);
+		}
+	}
+
+#ifdef HJDEBUG
+	printf("Nbuckets increased to %d, average items per bucket %.1f\n",
+		   hashtable->nbuckets,
+		   (hashtable->totalTuples - hashtable->skewTuples) / hashtable->nbuckets);
+#endif
+}
+
+
 /*
  * ExecHashTableInsert
  *		insert a tuple into the hash table depending on the hash value
@@ -736,6 +846,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		 */
 		HashJoinTuple hashTuple;
 		int			hashTupleSize;
+		double		ntuples = (hashtable->totalTuples - hashtable->skewTuples);

 		/* Create the HashJoinTuple */
 		hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
@@ -756,11 +867,24 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		hashTuple->next = hashtable->buckets[bucketno];
 		hashtable->buckets[bucketno] = hashTuple;

+		/*
+		 * Increase the (optimal) number of buckets if we just exceeded the
+		 * NTUP_PER_BUCKET threshold, but only when there's still a single batch.
+		 */
+		if ((hashtable->nbatch == 1) &&
+			(hashtable->nbuckets_optimal <= INT_MAX / 2) &&		/* overflow protection */
+			(ntuples >= (hashtable->nbuckets_optimal * NTUP_PER_BUCKET)))
+		{
+			hashtable->nbuckets_optimal *= 2;
+			hashtable->log2_nbuckets_optimal += 1;
+		}
+
 		/* Account for space used, and back off if we've used too much */
 		hashtable->spaceUsed += hashTupleSize;
 		if (hashtable->spaceUsed > hashtable->spacePeak)
 			hashtable->spacePeak = hashtable->spaceUsed;
-		if (hashtable->spaceUsed + hashtable->nbuckets * sizeof(HashJoinTuple)
+		if (hashtable->spaceUsed +
+			hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
 			> hashtable->spaceAllowed)
 			ExecHashIncreaseNumBatches(hashtable);
 	}
@@ -885,7 +1009,10 @@ ExecHashGetHashValue(HashJoinTable hashtable,
  * functions are good about randomizing all their output bits, else we are
  * likely to have very skewed bucket or batch occupancy.)
  *
- * nbuckets doesn't change over the course of the join.
+ * nbuckets and log2_nbuckets may change while nbatch == 1 because of dynamic
+ * bucket count growth.  Once we start batching, the value is fixed and does
+ * not change over the course of the join (making it possible to compute batch
+ * number the way we do here).
  *
  * nbatch is always a power of 2; we increase it only by doubling it. This
  * effectively adds one more bit to the top of the batchno.
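The comment is describing the split used by ExecHashGetBucketAndBatch: the low log2_nbuckets bits of the hash select the bucket, and the next bits select the batch. If nbuckets could still grow after nbatch exceeds 1, previously assigned batch numbers would be invalidated, which is why growth stops once batching begins. A sketch of the scheme (simplified standalone signature; both counts must be powers of two):

    #include <stdint.h>

    /* Low bits pick the bucket; the next log2(nbatch) bits pick the batch. */
    static void
    get_bucket_and_batch(uint32_t hashvalue, int nbuckets, int log2_nbuckets,
                         int nbatch, int *bucketno, int *batchno)
    {
        *bucketno = hashvalue & (nbuckets - 1);
        *batchno = (nbatch > 1) ? (hashvalue >> log2_nbuckets) & (nbatch - 1) : 0;
    }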

src/include/executor/hashjoin.h

Lines changed: 5 additions & 0 deletions

@@ -127,6 +127,10 @@ typedef struct HashJoinTableData
 	int			nbuckets;		/* # buckets in the in-memory hash table */
 	int			log2_nbuckets;	/* its log2 (nbuckets must be a power of 2) */

+	int			nbuckets_original;		/* # buckets when starting the first hash */
+	int			nbuckets_optimal;		/* optimal # buckets (per batch) */
+	int			log2_nbuckets_optimal;	/* same as log2_nbuckets optimal */
+
 	/* buckets[i] is head of list of tuples in i'th in-memory bucket */
 	struct HashJoinTupleData **buckets;
 	/* buckets array is per-batch storage, as are all the tuples */
@@ -148,6 +152,7 @@ typedef struct HashJoinTableData
 	bool		growEnabled;	/* flag to shut off nbatch increases */

 	double		totalTuples;	/* # tuples obtained from inner plan */
+	double		skewTuples;		/* # tuples inserted into skew hashtable */

 	/*
 	 * These arrays are allocated for the life of the hash join, but only if
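For reference, the relationships these new fields maintain can be stated as assertions (a hypothetical helper, not part of the patch; the power-of-two and never-shrink properties follow from the comments and Asserts in the diff above):

    #include <assert.h>

    /* Hypothetical summary of the field invariants added above. */
    static void
    check_bucket_invariants(int nbuckets, int log2_nbuckets,
                            int nbuckets_optimal, int log2_nbuckets_optimal)
    {
        assert(nbuckets == (1 << log2_nbuckets));               /* power of 2 */
        assert(nbuckets_optimal == (1 << log2_nbuckets_optimal));
        assert(nbuckets_optimal >= nbuckets);   /* we never shrink the table */
    }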

    0 commit comments
