@@ -355,10 +355,13 @@ typedef struct XLogCtlInsert
355355 * exclusiveBackup is true if a backup started with pg_start_backup() is
356356 * in progress, and nonExclusiveBackups is a counter indicating the number
357357 * of streaming base backups currently in progress. forcePageWrites is
358- * set to true when either of these is non-zero.
358+ * set to true when either of these is non-zero. lastBackupStart is the
359+ * latest checkpoint redo location used as a starting point for an online
360+ * backup.
359361 */
360362 bool exclusiveBackup ;
361363 int nonExclusiveBackups ;
364+ XLogRecPtr lastBackupStart ;
362365} XLogCtlInsert ;
363366
364367/*
@@ -8808,6 +8811,19 @@ do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
88088811 errmsg ("backup label too long (max %d bytes)" ,
88098812 MAXPGPATH )));
88108813
8814+ /*
8815+ * Force an XLOG file switch before the checkpoint, to ensure that the WAL
8816+ * segment the checkpoint is written to doesn't contain pages with old
8817+ * timeline IDs. That would otherwise happen if you called
8818+ * pg_start_backup() right after restoring from a PITR archive: the first
8819+ * WAL segment containing the startup checkpoint has pages in the
8820+ * beginning with the old timeline ID. That can cause trouble at recovery:
8821+ * we won't have a history file covering the old timeline if pg_xlog
8822+ * directory was not included in the base backup and the WAL archive was
8823+ * cleared too before starting the backup.
8824+ */
8825+ RequestXLogSwitch ();
8826+
88118827 /*
88128828 * Mark backup active in shared memory. We must do full-page WAL writes
88138829 * during an on-line backup even if not doing so at other times, because
@@ -8843,43 +8859,54 @@ do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
88438859 XLogCtl -> Insert .forcePageWrites = true;
88448860 LWLockRelease (WALInsertLock );
88458861
8846- /*
8847- * Force an XLOG file switch before the checkpoint, to ensure that the WAL
8848- * segment the checkpoint is written to doesn't contain pages with old
8849- * timeline IDs. That would otherwise happen if you called
8850- * pg_start_backup() right after restoring from a PITR archive: the first
8851- * WAL segment containing the startup checkpoint has pages in the
8852- * beginning with the old timeline ID. That can cause trouble at recovery:
8853- * we won't have a history file covering the old timeline if pg_xlog
8854- * directory was not included in the base backup and the WAL archive was
8855- * cleared too before starting the backup.
8856- */
8857- RequestXLogSwitch ();
8858-
88598862 /* Ensure we release forcePageWrites if fail below */
88608863 PG_ENSURE_ERROR_CLEANUP (pg_start_backup_callback , (Datum ) BoolGetDatum (exclusive ));
88618864 {
8862- /*
8863- * Force a CHECKPOINT. Aside from being necessary to prevent torn
8864- * page problems, this guarantees that two successive backup runs will
8865- * have different checkpoint positions and hence different history
8866- * file names, even if nothing happened in between.
8867- *
8868- * We use CHECKPOINT_IMMEDIATE only if requested by user (via passing
8869- * fast = true). Otherwise this can take awhile.
8870- */
8871- RequestCheckpoint (CHECKPOINT_FORCE | CHECKPOINT_WAIT |
8872- (fast ? CHECKPOINT_IMMEDIATE : 0 ));
8865+ bool gotUniqueStartpoint = false;
8866+ do
8867+ {
8868+ /*
8869+ * Force a CHECKPOINT. Aside from being necessary to prevent torn
8870+ * page problems, this guarantees that two successive backup runs will
8871+ * have different checkpoint positions and hence different history
8872+ * file names, even if nothing happened in between.
8873+ *
8874+ * We use CHECKPOINT_IMMEDIATE only if requested by user (via passing
8875+ * fast = true). Otherwise this can take awhile.
8876+ */
8877+ RequestCheckpoint (CHECKPOINT_FORCE | CHECKPOINT_WAIT |
8878+ (fast ? CHECKPOINT_IMMEDIATE : 0 ));
88738879
8874- /*
8875- * Now we need to fetch the checkpoint record location, and also its
8876- * REDO pointer. The oldest point in WAL that would be needed to
8877- * restore starting from the checkpoint is precisely the REDO pointer.
8878- */
8879- LWLockAcquire (ControlFileLock , LW_SHARED );
8880- checkpointloc = ControlFile -> checkPoint ;
8881- startpoint = ControlFile -> checkPointCopy .redo ;
8882- LWLockRelease (ControlFileLock );
8880+ /*
8881+ * Now we need to fetch the checkpoint record location, and also its
8882+ * REDO pointer. The oldest point in WAL that would be needed to
8883+ * restore starting from the checkpoint is precisely the REDO pointer.
8884+ */
8885+ LWLockAcquire (ControlFileLock , LW_SHARED );
8886+ checkpointloc = ControlFile -> checkPoint ;
8887+ startpoint = ControlFile -> checkPointCopy .redo ;
8888+ LWLockRelease (ControlFileLock );
8889+
8890+ /*
8891+ * If two base backups are started at the same time (in WAL
8892+ * sender processes), we need to make sure that they use
8893+ * different checkpoints as starting locations, because we use
8894+ * the starting WAL location as a unique identifier for the base
8895+ * backup in the end-of-backup WAL record and when we write the
8896+ * backup history file. Perhaps it would be better to generate a
8897+ * separate unique ID for each backup instead of forcing another
8898+ * checkpoint, but taking a checkpoint right after another is
8899+ * not that expensive either because only few buffers have been
8900+ * dirtied yet.
8901+ */
8902+ LWLockAcquire (WALInsertLock , LW_SHARED );
8903+ if (XLByteLT (XLogCtl -> Insert .lastBackupStart , startpoint ))
8904+ {
8905+ XLogCtl -> Insert .lastBackupStart = startpoint ;
8906+ gotUniqueStartpoint = true;
8907+ }
8908+ LWLockRelease (WALInsertLock );
8909+ } while (!gotUniqueStartpoint );
88838910
88848911 XLByteToSeg (startpoint , _logId , _logSeg );
88858912 XLogFileName (xlogfilename , ThisTimeLineID , _logId , _logSeg );
0 commit comments