Perform an immediate shutdown if the postmaster.pid file is removed. · SudhirLonkar/postgres@dea6da1 · GitHub
[go: up one dir, main page]

Skip to content

Commit dea6da1

Browse files
committed
Perform an immediate shutdown if the postmaster.pid file is removed.
The postmaster now checks every minute or so (worst case, at most two minutes) that postmaster.pid is still there and still contains its own PID. If not, it performs an immediate shutdown, as though it had received SIGQUIT. The original goal behind this change was to ensure that failed buildfarm runs would get fully cleaned up, even if the test scripts had left a postmaster running, which is not an infrequent occurrence. When the buildfarm script removes a test postmaster's $PGDATA directory, its next check on postmaster.pid will fail and cause it to exit. Previously, manual intervention was often needed to get rid of such orphaned postmasters, since they'd block new test postmasters from obtaining the expected socket address. However, by checking postmaster.pid and not something else, we can provide additional robustness: manual removal of postmaster.pid is a frequent DBA mistake, and now we can at least limit the damage that will ensue if a new postmaster is started while the old one is still alive. Back-patch to all supported branches, since we won't get the desired improvement in buildfarm reliability otherwise.
1 parent f0ceb25 commit dea6da1

File tree

3 files changed

+113
-12
lines changed

3 files changed

+113
-12
lines changed

src/backend/postmaster/postmaster.c

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1471,9 +1471,10 @@ ServerLoop(void)
14711471
fd_set readmask;
14721472
int nSockets;
14731473
time_t now,
1474+
last_lockfile_recheck_time,
14741475
last_touch_time;
14751476

1476-
last_touch_time = time(NULL);
1477+
last_lockfile_recheck_time = last_touch_time = time(NULL);
14771478

14781479
nSockets = initMasks(&readmask);
14791480

@@ -1614,27 +1615,56 @@ ServerLoop(void)
16141615
kill(AutoVacPID, SIGUSR2);
16151616
}
16161617

1618+
#ifdef HAVE_PTHREAD_IS_THREADED_NP
1619+
1620+
/*
1621+
* With assertions enabled, check regularly for appearance of
1622+
* additional threads. All builds check at start and exit.
1623+
*/
1624+
Assert(pthread_is_threaded_np() == 0);
1625+
#endif
1626+
1627+
/*
1628+
* Lastly, check to see if it's time to do some things that we don't
1629+
* want to do every single time through the loop, because they're a
1630+
* bit expensive. Note that there's up to a minute of slop in when
1631+
* these tasks will be performed, since DetermineSleepTime() will let
1632+
* us sleep at most that long.
1633+
*/
1634+
now = time(NULL);
1635+
16171636
/*
1618-
* Touch the socket and lock file every 58 minutes, to ensure that
1637+
* Once a minute, verify that postmaster.pid hasn't been removed or
1638+
* overwritten. If it has, we force a shutdown. This avoids having
1639+
* postmasters and child processes hanging around after their database
1640+
* is gone, and maybe causing problems if a new database cluster is
1641+
* created in the same place. It also provides some protection
1642+
* against a DBA foolishly removing postmaster.pid and manually
1643+
* starting a new postmaster. Data corruption is likely to ensue from
1644+
* that anyway, but we can minimize the damage by aborting ASAP.
1645+
*/
1646+
if (now - last_lockfile_recheck_time >= 1 * SECS_PER_MINUTE)
1647+
{
1648+
if (!RecheckDataDirLockFile())
1649+
{
1650+
ereport(LOG,
1651+
(errmsg("performing immediate shutdown because data directory lock file is invalid")));
1652+
kill(MyProcPid, SIGQUIT);
1653+
}
1654+
last_lockfile_recheck_time = now;
1655+
}
1656+
1657+
/*
1658+
* Touch Unix socket and lock file every 58 minutes, to ensure that
16191659
* they are not removed by overzealous /tmp-cleaning tasks. We assume
16201660
* no one runs cleaners with cutoff times of less than an hour ...
16211661
*/
1622-
now = time(NULL);
16231662
if (now - last_touch_time >= 58 * SECS_PER_MINUTE)
16241663
{
16251664
TouchSocketFile();
16261665
TouchSocketLockFile();
16271666
last_touch_time = now;
16281667
}
1629-
1630-
#ifdef HAVE_PTHREAD_IS_THREADED_NP
1631-
1632-
/*
1633-
* With assertions enabled, check regularly for appearance of
1634-
* additional threads. All builds check at start and exit.
1635-
*/
1636-
Assert(pthread_is_threaded_np() == 0);
1637-
#endif
16381668
}
16391669
}
16401670

src/backend/utils/init/miscinit.c

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1116,6 +1116,76 @@ AddToDataDirLockFile(int target_line, const char *str)
11161116
}
11171117

11181118

1119+
/*
1120+
* Recheck that the data directory lock file still exists with expected
1121+
* content. Return TRUE if the lock file appears OK, FALSE if it isn't.
1122+
*
1123+
* We call this periodically in the postmaster. The idea is that if the
1124+
* lock file has been removed or replaced by another postmaster, we should
1125+
* do a panic database shutdown. Therefore, we should return TRUE if there
1126+
* is any doubt: we do not want to cause a panic shutdown unnecessarily.
1127+
* Transient failures like EINTR or ENFILE should not cause us to fail.
1128+
* (If there really is something wrong, we'll detect it on a future recheck.)
1129+
*/
1130+
bool
1131+
RecheckDataDirLockFile(void)
1132+
{
1133+
int fd;
1134+
int len;
1135+
long file_pid;
1136+
char buffer[BLCKSZ];
1137+
1138+
fd = open(DIRECTORY_LOCK_FILE, O_RDWR | PG_BINARY, 0);
1139+
if (fd < 0)
1140+
{
1141+
/*
1142+
* There are many foreseeable false-positive error conditions. For
1143+
* safety, fail only on enumerated clearly-something-is-wrong
1144+
* conditions.
1145+
*/
1146+
switch (errno)
1147+
{
1148+
case ENOENT:
1149+
case ENOTDIR:
1150+
/* disaster */
1151+
ereport(LOG,
1152+
(errcode_for_file_access(),
1153+
errmsg("could not open file \"%s\": %m",
1154+
DIRECTORY_LOCK_FILE)));
1155+
return false;
1156+
default:
1157+
/* non-fatal, at least for now */
1158+
ereport(LOG,
1159+
(errcode_for_file_access(),
1160+
errmsg("could not open file \"%s\": %m; continuing anyway",
1161+
DIRECTORY_LOCK_FILE)));
1162+
return true;
1163+
}
1164+
}
1165+
len = read(fd, buffer, sizeof(buffer) - 1);
1166+
if (len < 0)
1167+
{
1168+
ereport(LOG,
1169+
(errcode_for_file_access(),
1170+
errmsg("could not read from file \"%s\": %m",
1171+
DIRECTORY_LOCK_FILE)));
1172+
close(fd);
1173+
return true; /* treat read failure as nonfatal */
1174+
}
1175+
buffer[len] = '\0';
1176+
close(fd);
1177+
file_pid = atol(buffer);
1178+
if (file_pid == getpid())
1179+
return true; /* all is well */
1180+
1181+
/* Trouble: someone's overwritten the lock file */
1182+
ereport(LOG,
1183+
(errmsg("lock file \"%s\" contains wrong PID: %ld instead of %ld",
1184+
DIRECTORY_LOCK_FILE, file_pid, (long) getpid())));
1185+
return false;
1186+
}
1187+
1188+
11191189
/*-------------------------------------------------------------------------
11201190
* Version checking support
11211191
*-------------------------------------------------------------------------

src/include/miscadmin.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,7 @@ extern void CreateDataDirLockFile(bool amPostmaster);
405405
extern void CreateSocketLockFile(const char *socketfile, bool amPostmaster);
406406
extern void TouchSocketLockFile(void);
407407
extern void AddToDataDirLockFile(int target_line, const char *str);
408+
extern bool RecheckDataDirLockFile(void);
408409
extern void ValidatePgVersion(const char *path);
409410
extern void process_shared_preload_libraries(void);
410411
extern void process_local_preload_libraries(void);

0 commit comments

Comments
 (0)
0