8000 Fix dynahash.c to suppress hash bucket splits while a hash_seq_search… · commandprompt/postgres@2d91f67 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2d91f67

Browse files
committed
Fix dynahash.c to suppress hash bucket splits while a hash_seq_search() scan
is in progress on the same hashtable. This seems the least invasive way to fix the recently-recognized problem that a split could cause the scan to visit entries twice or (with much lower probability) miss them entirely. The only field-reported problem caused by this is the "failed to re-find shared lock object" PANIC in COMMIT PREPARED reported by Michel Dorochevsky, which was caused by multiply visited entries. However, it seems certain that mdsync() is vulnerable to missing required fsync's due to missed entries, and I am fearful that RelationCacheInitializePhase2() might be at risk as well. Because of that and the generalized hazard presented by this bug, back-patch all the supported branches. Along the way, fix pg_prepared_statement() and pg_cursor() to not assume that the hashtables they are examining will stay static between calls. This is risky regardless of the newly noted dynahash problem, because hash_seq_search() has never promised to cope with deletion of table entries other than the just-returned one. There may be no bug here because the only supported way to call these functions is via ExecMakeTableFunctionResult() which will cycle them to completion before doing anything very interesting, but it seems best to get rid of the assumption. This affects 8.2 and HEAD only, since those functions weren't there earlier.
1 parent a8ac5f7 commit 2d91f67

File tree

3 files changed

+174
-6
lines changed

3 files changed

+174
-6
lines changed

src/backend/access/transam/xact.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.135.2.3 2006/05/21 20:07:11 tgl Exp $
11+
* $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.135.2.4 2007/04/26 23:25:48 tgl Exp $
1212
*
1313
* NOTES
1414
* Transaction aborts can now occur two ways:
@@ -1053,6 +1053,7 @@ CommitTransaction(void)
10531053
AtEOXact_Namespace(true);
10541054
AtEOXact_CatCache(true);
10551055
AtEOXact_Files();
1056+
AtEOXact_HashTables(true);
10561057
pgstat_count_xact_commit();
10571058
AtCommit_Memory();
10581059

@@ -1168,6 +1169,7 @@ AbortTransaction(void)
11681169
AtEOXact_Namespace(false);
11691170
AtEOXact_CatCache(false);
11701171
AtEOXact_Files();
1172+
AtEOXact_HashTables(false);
11711173
pgstat_count_xact_rollback();
11721174

11731175
/*

src/backend/utils/hash/dynahash.c

Lines changed: 165 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
*
1010
*
1111
* IDENTIFICATION
12-
* $Header: /cvsroot/pgsql/src/backend/utils/hash/dynahash.c,v 1.45 2002/10/31 21:59:32 tgl Exp $
12+
* $Header: /cvsroot/pgsql/src/backend/utils/hash/dynahash.c,v 1.45.2.1 2007/04/26 23:25:48 tgl Exp $
1313
*
1414
*-------------------------------------------------------------------------
1515
*/
@@ -71,6 +71,9 @@ static bool expand_table(HTAB *hashp);
7171
static bool hdefault(HTAB *hashp);
7272
static bool init_htab(HTAB *hashp, long nelem);
7373
static void hash_corrupted(HTAB *hashp);
74+
static void register_seq_scan(HTAB *hashp);
75+
static void deregister_seq_scan(HTAB *hashp);
76+
static bool has_seq_scans(HTAB *hashp);
7477

7578

7679
/*
@@ -166,6 +169,8 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
166169
return NULL;
167170
}
168171

172+
hashp->frozen = false;
173+
169174
if (!hdefault(hashp))
170175
return NULL;
171176

@@ -623,6 +628,10 @@ hash_search(HTAB *hashp,
623628
if (currBucket != NULL)
624629
return (void *) ELEMENTKEY(currBucket);
625630

631+
/* disallow inserts if frozen */
632+
if (hashp->frozen)
633+
elog(ERROR, "cannot insert into a frozen hashtable");
634+
626635
/* get the next free element */
627636
currBucket = hctl->freeList;
628637
if (currBucket == NULL)
@@ -645,8 +654,12 @@ hash_search(HTAB *hashp,
645654

646655
/* caller is expected to fill the data field on return */
647656

648-
/* Check if it is time to split the segment */
649-
if (++hctl->nentries / (long) (hctl->max_bucket + 1) > hctl->< A3DB /span>ffactor)
657+
/*
658+
* Check if it is time to split a bucket. Can't split if table
659+
* is the subject of any active hash_seq_search scans.
660+
*/
661+
if (++hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
662+
!has_seq_scans(hashp))
650663
{
651664
/*
652665
* NOTE: failure to expand table is not a fatal error, it
@@ -665,22 +678,34 @@ hash_search(HTAB *hashp,
665678
}
666679

667680
/*
668-
* hash_seq_init/_search
681+
* hash_seq_init/_search/_term
669682
* Sequentially search through hash table and return
670683
* all the elements one by one, return NULL when no more.
671684
*
685+
* hash_seq_term should be called if and only if the scan is abandoned before
686+
* completion; if hash_seq_search returns NULL then it has already done the
687+
* end-of-scan cleanup.
688+
*
672689
* NOTE: caller may delete the returned element before continuing the scan.
673690
* However, deleting any other element while the scan is in progress is
674691
* UNDEFINED (it might be the one that curIndex is pointing at!). Also,
675692
* if elements are added to the table while the scan is in progress, it is
676693
* unspecified whether they will be visited by the scan or not.
694+
*
695+
* NOTE: it is possible to use hash_seq_init/hash_seq_search without any
696+
* worry about hash_seq_term cleanup, if the hashtable is first locked against
697+
* further insertions by calling hash_freeze. This is used by nodeAgg.c,
698+
* wherein it is inconvenient to track whether a scan is still open, and
699+
* there's no possibility of further insertions after readout has begun.
677700
*/
678701
void
679702
hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
680703
{
681704
status->hashp = hashp;
682705
status->curBucket = 0;
683706
status->curEntry = NULL;
707+
if (!hashp->frozen)
708+
register_seq_scan(hashp);
684709
}
685710

686711
void *
@@ -734,9 +759,40 @@ hash_seq_search(HASH_SEQ_STATUS *status)
734759
++status->curBucket;
735760
}
736761

762+
hash_seq_term(status);
737763
return NULL; /* out of buckets */
738764
}
739765

766+
void
767+
hash_seq_term(HASH_SEQ_STATUS *status)
768+
{
769+
if (!status->hashp->frozen)
770+
deregister_seq_scan(status->hashp);
771+
}
772+
773+
/*
774+
* hash_freeze
775+
* Freeze a hashtable against future insertions (deletions are
776+
* still allowed)
777+
*
778+
* The reason for doing this is that by preventing any more bucket splits,
779+
* we no longer need to worry about registering hash_seq_search scans,
780+
* and thus caller need not be careful about ensuring hash_seq_term gets
781+
* called at the right times.
782+
*
783+
* Multiple calls to hash_freeze() are allowed, but you can't freeze a table
784+
* with active scans (since hash_seq_term would then do the wrong thing).
785+
*/
786+
void
787+
hash_freeze(HTAB *hashp)
788+
{
789+
if (hashp->isshared)
790+
elog(ERROR, "cannot freeze shared hashtable");
791+
if (!hashp->frozen && has_seq_scans(hashp))
792+
elog(ERROR, "cannot freeze hashtable with active scans");
793+
hashp->frozen = true;
794+
}
795+
740796

741797
/********************************* UTILITIES ************************/
742798

@@ -948,3 +1004,108 @@ my_log2(long num)
9481004
;
9491005
return i;
9501006
}
1007+
1008+
1009+
/************************* SEQ SCAN TRACKING ************************/
1010+
1011+
/*
1012+
* We track active hash_seq_search scans here. The need for this mechanism
1013+
* comes from the fact that a scan will get confused if a bucket split occurs
1014+
* while it's i F42D n progress: it might visit entries twice, or even miss some
1015+
* entirely (if it's partway through the same bucket that splits). Hence
1016+
* we want to inhibit bucket splits if there are any active scans on the
1017+
* table being inserted into. This is a fairly rare case in current usage,
1018+
* so just postponing the split until the next insertion seems sufficient.
1019+
*
1020+
* Given present usages of the function, only a few scans are likely to be
1021+
* open concurrently; so a finite-size stack of open scans seems sufficient,
1022+
* and we don't worry that linear search is too slow. Note that we do
1023+
* allow multiple scans of the same hashtable to be open concurrently.
1024+
*
1025+
* This mechanism can support concurrent scan and insertion in a shared
1026+
* hashtable if it's the same backend doing both. It would fail otherwise,
1027+
* but locking reasons seem to preclude any such scenario anyway, so we don't
1028+
* worry.
1029+
*
1030+
* This arrangement is reasonably robust if a transient hashtable is deleted
1031+
* without notifying us. The absolute worst case is we might inhibit splits
1032+
* in another table created later at exactly the same address. We will give
1033+
* a warning at transaction end for reference leaks, so any bugs leading to
1034+
* lack of notification should be easy to catch.
1035+
*/
1036+
1037+
#define MAX_SEQ_SCANS 100
1038+
1039+
static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */
1040+
static int num_seq_scans = 0;
1041+
1042+
1043+
/* Register a table as having an active hash_seq_search scan */
1044+
static void
1045+
register_seq_scan(HTAB *hashp)
1046+
{
1047+
if (num_seq_scans >= MAX_SEQ_SCANS)
1048+
elog(ERROR, "too many active hash_seq_search scans");
1049+
seq_scan_tables[num_seq_scans] = hashp;
1050+
num_seq_scans++;
1051+
}
1052+
1053+
/* Deregister an active scan */
1054+
static void
1055+
deregister_seq_scan(HTAB *hashp)
1056+
{
1057+
int i;
1058+
1059+
/* Search backward since it's most likely at the stack top */
1060+
for (i = num_seq_scans - 1; i >= 0; i--)
1061+
{
1062+
if (seq_scan_tables[i] == hashp)
1063+
{
1064+
seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1065+
num_seq_scans--;
1066+
return;
1067+
}
1068+
}
1069+
elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
1070+
hashp->tabname);
1071+
}
1072+
1073+
/* Check if a table has any active scan */
1074+
static bool
1075+
has_seq_scans(HTAB *hashp)
1076+
{
1077+
int i;
1078+
1079+
for (i = 0; i < num_seq_scans; i++)
1080+
{
1081+
if (seq_scan_tables[i] == hashp)
1082+
return true;
1083+
}
1084+
return false;
1085+
}
1086+
1087+
/* Clean up any open scans at end of transaction */
1088+
void
1089+
AtEOXact_HashTables(bool isCommit)
1090+
{
1091+
/*
1092+
* During abort cleanup, open scans are expected; just silently clean 'em
1093+
* out. An open scan at commit means someone forgot a hash_seq_term()
1094+
* call, so complain.
1095+
*
1096+
* Note: it's tempting to try to print the tabname here, but refrain for
1097+
* fear of touching deallocated memory. This isn't a user-facing message
1098+
* anyway, so it needn't be pretty.
1099+
*/
1100+
if (isCommit)
1101+
{
1102+
int i;
1103+
1104+
for (i = 0; i < num_seq_scans; i++)
1105+
{
1106+
elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1107+
seq_scan_tables[i]);
1108+
}
1109+
}
1110+
num_seq_scans = 0;
1111+
}

src/include/utils/hsearch.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $Id: hsearch.h,v 1.27 2002/06/20 20:29:53 momjian Exp $
10+
* $Id: hsearch.h,v 1.27.2.1 2007/04/26 23:25:48 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -89,6 +89,8 @@ typedef struct HTAB
8989
* used */
9090
char *tabname; /* table name (for error messages) */
9191
bool isshared; /* true if table is in shared memory */
92+
/* freezing a shared table isn't allowed, so we can keep state here */
93+
bool frozen; /* true = no more inserts allowed */
9294
} HTAB;
9395

9496
/* Parameter data structure for hash_create */
@@ -155,8 +157,11 @@ extern void *hash_search(HTAB *hashp, void *keyPtr, HASHACTION action,
155157
bool *foundPtr);
156158
extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp);
157159
extern void *hash_seq_search(HASH_SEQ_STATUS *status);
160+
extern void hash_seq_term(HASH_SEQ_STATUS *status);
161+
extern void hash_freeze(HTAB *hashp);
158162
extern long hash_estimate_size(long num_entries, long entrysize);
159163
extern long hash_select_dirsize(long num_entries);
164+
extern void AtEOXact_HashTables(bool isCommit);
160165

161166
/*
162167
* prototypes for functions in hashfn.c

0 commit comments

Comments
 (0)
0