8000 Fix an assertion failure related to an exclusive backup. · intobs/postgres@9e7f00d · GitHub
[go: up one dir, main page]

Skip to content

Commit 9e7f00d

Browse files
committed
Fix an assertion failure related to an exclusive backup.
Previously multiple sessions could execute pg_start_backup() and pg_stop_backup() to start and stop an exclusive backup at the same time. This could trigger the assertion failure of "FailedAssertion("!(XLogCtl->Insert.exclusiveBackup)". This happend because, even while pg_start_backup() was starting an exclusive backup, other session could run pg_stop_backup() concurrently and mark the backup as not-in-progress unconditionally. This patch introduces ExclusiveBackupState indicating the state of an exclusive backup. This state is used to ensure that there is only one session running pg_start_backup() or pg_stop_backup() at the same time, to avoid the assertion failure. Back-patch to all supported versions. Author: Michael Paquier Reviewed-By: Kyotaro Horiguchi and me Reported-By: Andreas Seltenreich Discussion: <87mvktojme.fsf@credativ.de>
1 parent c377557 commit 9e7f00d

File tree

1 file changed

+148
-62
lines changed
  • src/backend/access/transam

1 file changed

+148
-62
lines changed

src/backend/access/transam/xlog.c

Lines changed: 148 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,29 @@ typedef union WALInsertLockPadded
417417
char pad[PG_CACHE_LINE_SIZE];
418418
} WALInsertLockPadded;
419419

420+
/*
421+
* State of an exclusive backup, necessary to control concurrent activities
422+
* across sessions when working on exclusive backups.
423+
*
424+
* EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
425+
* running, to be more precise pg_start_backup() is not being executed for
426+
* an exclusive backup and there is no exclusive backup in progress.
427+
* EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
428+
* exclusive backup.
429+
* EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
430+
* running and an exclusive backup is in progress. pg_stop_backup() is
431+
* needed to finish it.
432+
* EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
433+
* exclusive backup.
434+
*/
435+
typedef enum ExclusiveBackupState
436+
{
437+
EXCLUSIVE_BACKUP_NONE = 0,
438+
EXCLUSIVE_BACKUP_STARTING,
439+
EXCLUSIVE_BACKUP_IN_PROGRESS,
440+
EXCLUSIVE_BACKUP_STOPPING
441+
} ExclusiveBackupState;
442+
420443
/*
421444
* Shared state data for XLogInsert.
422445
*/
@@ -458,13 +481,15 @@ typedef struct XLogCtlInsert
458481
bool fullPageWrites;
459482

460483
/*
461-
* exclusiveBackup is true if a backup started with pg_start_backup() is
462-
* in progress, and nonExclusiveBackups is a counter indicating the number
463-
* of streaming base backups currently in progress. forcePageWrites is set
464-
* to true when either of these is non-zero. lastBackupStart is the latest
465-
* checkpoint redo location used as a starting point for an online backup.
484+
* exclusiveBackupState indicates the state of an exclusive backup
485+
* (see comments of ExclusiveBackupState for more details).
486+
* nonExclusiveBackups is a counter indicating the number of streaming
487+
* base backups currently in progress. forcePageWrites is set to true
488+
* when either of these is non-zero. lastBackupStart is the latest
489+
* checkpoint redo location used as a starting point for an online
490+
* backup.
466491
*/
467-
bool exclusiveBackup;
492+
ExclusiveBackupState exclusiveBackupState;
468493
int nonExclusiveBackups;
469494
XLogRecPtr lastBackupStart;
470495

@@ -806,6 +831,7 @@ static bool CheckForStandbyTrigger(void);
806831
static void xlog_outrec(StringInfo buf, XLogRecord *record);
807832
#endif
808833
static void pg_start_backup_callback(int code, Datum arg);
834+
static void pg_stop_backup_callback(int code, Datum arg);
809835
static bool read_backup_label(XLogRecPtr *checkPointLoc,
810836
bool *backupEndRequired, bool *backupFromStandby);
811837
static void rm_redo_error_callback(void *arg);
@@ -9880,15 +9906,20 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
98809906
WALInsertLockAcquireExclusive();
98819907
if (exclusive)
98829908
{
9883-
if (XLogCtl->Insert.exclusiveBackup)
9909+
/*
9910+
* At first, mark that we're now starting an exclusive backup,
9911+
* to ensure that there are no other sessions currently running
9912+
* pg_start_backup() or pg_stop_backup().
9913+
*/
9914+
if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
98849915
{
98859916
WALInsertLockRelease();
98869917
ereport(ERROR,
98879918
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
98889919
errmsg("a backup is already in progress"),
98899920
errhint("Run pg_stop_backup() and try again.")));
98909921
}
9891-
XLogCtl->Insert.exclusiveBackup = true;
9922+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
98929923
}
98939924
else
98949925
XLogCtl->Insert.nonExclusiveBackups++;
@@ -10048,7 +10079,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
1004810079
{
1004910080
/*
1005010081
* Check for existing backup label --- implies a backup is already
10051-
* running. (XXX given that we checked exclusiveBackup above,
10082+
* running. (XXX given that we checked exclusiveBackupState above,
1005210083
* maybe it would be OK to just unlink any such label file?)
1005310084
*/
1005410085
if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
@@ -10089,6 +10120,16 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
1008910120
}
1009010121
PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
1009110122

10123+
/*
10124+
* Mark that start phase has correctly finished for an exclusive backup.
10125+
*/
10126+
if (exclusive)
10127+
{
10128+
WALInsertLockAcquireExclusive();
10129+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10130+
WALInsertLockRelease();
10131+
}
10132+
1009210133
/*
1009310134
* We're done. As a convenience, return the starting WAL location.
1009410135
*/
@@ -10107,23 +10148,41 @@ pg_start_backup_callback(int code, Datum arg)
1010710148
WALInsertLockAcquireExclusive();
1010810149
if (exclusive)
1010910150
{
10110-
Assert(XLogCtl->Insert.exclusiveBackup);
10111-
XLogCtl->Insert.exclusiveBackup = false;
10151+
Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
10152+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
1011210153
}
1011310154
else
1011410155
{
1011510156
Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
1011610157
XLogCtl->Insert.nonExclusiveBackups--;
1011710158
}
1011810159

10119-
if (!XLogCtl->Insert.exclusiveBackup &&
10160+
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
1012010161
XLogCtl->Insert.nonExclusiveBackups == 0)
1012110162
{
1012210163
XLogCtl->Insert.forcePageWrites = false;
1012310164
}
1012410165
WALInsertLockRelease();
1012510166
}
1012610167

10168+
/*
10169+
* Error cleanup callback for pg_stop_backup
10170+
*/
10171+
static void
10172+
pg_stop_backup_callback(int code, Datum arg)
10173+
{
10174+
bool exclusive = DatumGetBool(arg);
10175+
10176+
/* Update backup status on failure */
10177+
WALInsertLockAcquireExclusive();
10178+
if (exclusive)
10179+
{
10180+
Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
10181+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10182+
}
10183+
WALInsertLockRelease();
10184+
}
10185+
1012710186
/*
1012810187
* do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
1012910188
* function.
@@ -10187,12 +10246,86 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
1018710246
errmsg("WAL level not sufficient for making an online backup"),
1018810247
errhint("wal_level must be set to \"archive\", \"hot_standby\", or \"logical\" at server start.")));
1018910248

10249+
if (exclusive)
10250+
{
10251+
/*
10252+
* At first, mark that we're now stopping an exclusive backup,
10253+
* to ensure that there are no other sessions currently running
10254+
* pg_start_backup() or pg_stop_backup().
10255+
*/
10256+
WALInsertLockAcquireExclusive();
10257+
if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
10258+
{
10259+
WALInsertLockRelease();
10260+
ereport(ERROR,
10261+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10262+
errmsg("exclusive backup not in progress")));
10263+
}
10264+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
10265+
WALInsertLockRelease();
10266+
10267+
/*
10268+
* Remove backup_label. In case of failure, the state for an exclusive
10269+
* backup is switched back to in-progress.
10270+
*/
10271+
PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10272+
{
10273+
/*
10274+
* Read the existing label file into memory.
10275+
*/
10276+
struct stat statbuf;
10277+
int r;
10278+
10279+
if (stat(BACKUP_LABEL_FILE, &statbuf))
10280+
{
10281+
/* should not happen per the upper checks */
10282+
if (errno != ENOENT)
10283+
ereport(ERROR,
10284+
(errcode_for_file_access(),
10285+
errmsg("could not stat file \"%s\": %m",
10286+
BACKUP_LABEL_FILE)));
10287+
ereport(ERROR,
10288+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10289+
errmsg("a backup is not in progress")));
10290+
}
10291+
10292+
lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10293+
if (!lfp)
10294+
{
10295+
ereport(ERROR,
10296+
(errcode_for_file_access(),
10297+
errmsg("could not read file \"%s\": %m",
10298+
BACKUP_LABEL_FILE)));
10299+
}
10300+
labelfile = palloc(statbuf.st_size + 1);
10301+
r = fread(labelfile, statbuf.st_size, 1, lfp);
10302+
labelfile[statbuf.st_size] = '\0';
10303+
10304+
/*
10305+
* Close and remove the backup label file
10306+
*/
10307+
if (r != 1 || ferror(lfp) || FreeFile(lfp))
10308+
ereport(ERROR,
10309+
(errcode_for_file_access(),
10310+
errmsg("could not read file \"%s\": %m",
10311+
BACKUP_LABEL_FILE)));
10312+
if (unlink(BACKUP_LABEL_FILE) != 0)
10313+
ereport(ERROR,
10314+
(errcode_for_file_access(),
10315+
errmsg("could not remove file \"%s\": %m",
10316+
BACKUP_LABEL_FILE)));
10317+
}
10318+
PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10319+
}
10320+
1019010321
/*
1019110322
* OK to update backup counters and forcePageWrites
1019210323
*/
1019310324
WALInsertLockAcquireExclusive();
1019410325
if (exclusive)
10195-
XLogCtl->Insert.exclusiveBackup = false;
10326+
{
10327+
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10328+
}
1019610329
else
1019710330
{
1019810331
/*
@@ -10205,60 +10338,13 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
1020510338
XLogCtl->Insert.nonExclusiveBackups--;
1020610339
}
1020710340

10208-
if (!XLogCtl->Insert.exclusiveBackup &&
10341+
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
1020910342
XLogCtl->Insert.nonExclusiveBackups == 0)
1021010343
{
1021110344
XLogCtl->Insert.forcePageWrites = false;
1021210345
}
1021310346
WALInsertLockRelease();
1021410347

10215-
if (exclusive)
10216-
{
10217-
/*
10218-
* Read the existing label file into memory.
10219-
*/
10220-
struct stat statbuf;
10221-
int r;
10222-
10223-
if (stat(BACKUP_LABEL_FILE, &statbuf))
10224-
{
10225-
if (errno != ENOENT)
10226-
ereport(ERROR,
10227-
(errcode_for_file_access(),
10228-
errmsg("could not stat file \"%s\": %m",
10229-
BACKUP_LABEL_FILE)));
10230-
ereport(ERROR,
10231-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10232-
errmsg("a backup is not in progress")));
10233-
}
10234-
10235-
lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10236-
if (!lfp)
10237-
{
10238-
ereport(ERROR,
10239-
(errcode_for_file_access(),
10240-
errmsg("could not read file \"%s\": %m",
10241-
BACKUP_LABEL_FILE)));
10242-
}
10243-
labelfile = palloc(statbuf.st_size + 1);
10244-
r = fread(labelfile, statbuf.st_size, 1, lfp);
10245-
labelfile[statbuf.st_size] = '\0';
10246-
10247-
/*
10248-
* Close and remove the backup label file
10249-
*/
10250-
if (r != 1 || ferror(lfp) || FreeFile(lfp))
10251-
ereport(ERROR,
10252-
(errcode_for_file_access(),
10253-
errmsg("could not read file \"%s\": %m",
10254-
BACKUP_LABEL_FILE)));
10255-
if (unlink(BACKUP_LABEL_FILE) != 0)
10256-
ereport(ERROR,
10257-
(errcode_for_file_access(),
10258-
errmsg("could not remove file \"%s\": %m",
10259-
BACKUP_LABEL_FILE)));
10260-
}
10261-
1026210348
/*
1026310349
* Read and parse the START WAL LOCATION line (this code is pretty crude,
1026410350
* but we are not expecting any variability in the file format).
@@ -10499,7 +10585,7 @@ do_pg_abort_backup(void)
1049910585
Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
1050010586
XLogCtl->Insert.nonExclusiveBackups--;
1050110587

10502-
if (!XLogCtl->Insert.exclusiveBackup &&
10588+
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
1050310589
XLogCtl->Insert.nonExclusiveBackups == 0)
1050410590
{
1050510591
XLogCtl->Insert.forcePageWrites = false;

0 commit comments

Comments
 (0)
0