8000 Revert "Fix "pg_ctl start -w" to test child process status directly." · rtpg/postgres@d56c02f · GitHub
[go: up one dir, main page]

Skip to content

Commit d56c02f

Browse files
committed
Revert "Fix "pg_ctl start -w" to test child process status directly."
This reverts commit c869a7d. As pointed out by Maksym Sobolyev in bug #14199, that approach doesn't work if the postmaster forks itself an extra time due to silent_mode being enabled. We removed silent_mode in 9.2, so the pg_ctl change is fine in 9.2 and later, but it fails when that option is enabled in 9.1. Seeing that 9.1 is close to end-of-life, let's adopt the most conservative fix we can, which is to revert the pg_ctl change in the 9.1 branch. Discussion: <20160618042812.5798.85609@wrigleys.postgresql.org>
1 parent b9b925f commit d56c02f

File tree

1 file changed

+78
-107
lines changed

1 file changed

+78
-107
lines changed

src/bin/pg_ctl/pg_ctl.c

Lines changed: 78 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
#include <time.h>
2929
#include <sys/types.h>
3030
#include <sys/stat.h>
31-
#include <sys/wait.h>
3231
#include <unistd.h>
3332

3433
#ifdef HAVE_SYS_RESOURCE_H
@@ -155,10 +154,10 @@ static int pgwin32_is_service(void);
155154

156155
static pgpid_t get_pgpid(void);
157156
static char **readfile(const char *path);
158-
static pgpid_t start_postmaster(void);
157+
static int start_postmaster(void);
159158
static void read_post_opts(void);
160159

161-
static PGPing test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint);
160+
static PGPing test_postmaster_connection(bool);
162161
static bool postmaster_is_alive(pid_t pid);
163162

164163
#if defined(HAVE_GETRLIMIT) && defined(RLIMIT_CORE)
@@ -402,73 +401,36 @@ readfile(const char *path)
402401
* start/test/stop routines
403402
*/
404403

405-
/*
406-
* Start the postmaster and return its PID.
407-
*
408-
* Currently, on Windows what we return is the PID of the shell process
409-
* that launched the postmaster (and, we trust, is waiting for it to exit).
410-
* So the PID is usable for "is the postmaster still running" checks,
411-
* but cannot be compared directly to postmaster.pid.
412-
*
413-
* On Windows, we also save aside a handle to the shell process in
414-
* "postmasterProcess", which the caller should close when done with it.
415-
*/
416-
static pgpid_t
404+
static int
417405
start_postmaster(void)
418406
{
419407
char cmd[MAXPGPATH];
420408

421409
#ifndef WIN32
422-
pgpid_t pm_pid;
423-
424-
/* Flush stdio channels just before fork, to avoid double-output problems */
425-
fflush(stdout);
426-
fflush(stderr);
427-
428-
pm_pid = fork();
429-
if (pm_pid < 0)
430-
{
431-
/* fork failed */
432-
write_stderr(_("%s: could not start server: %s\n"),
433-
progname, strerror(errno));
434-
exit(1);
435-
}
436-
if (pm_pid > 0)
437-
{
438-
/* fork succeeded, in parent */
439-
return pm_pid;
440-
}
441-
442-
/* fork succeeded, in child */
443410

444411
/*
445412
* Since there might be quotes to handle here, it is easier simply to pass
446-
* everything to a shell to process them. Use exec so that the postmaster
447-
* has the same PID as the current child process.
413+
* everything to a shell to process them.
414+
*
415+
* XXX it would be better to fork and exec so that we would know the child
416+
* postmaster's PID directly; then test_postmaster_connection could use
417+
* the PID without having to rely on reading it back from the pidfile.
448418
*/
449419
if (log_file != NULL)
450-
snprintf(cmd, MAXPGPATH, "exec \"%s\" %s%s < \"%s\" >> \"%s\" 2>&1",
420+
snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE,
451421
exec_path, pgdata_opt, post_opts,
452422
DEVNULL, log_file);
453423
else
454-
snprintf(cmd, MAXPGPATH, "exec \"%s\" %s%s < \"%s\" 2>&1",
424+
snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE,
455425
exec_path, pgdata_opt, post_opts, DEVNULL);
456426

457-
(void) execl("/bin/sh", "/bin/sh", "-c", cmd, (char *) NULL);
458-
459-
/* exec failed */
460-
write_stderr(_("%s: could not start server: %s\n"),
461-
progname, strerror(errno));
462-
exit(1);
463-
464-
return 0; /* keep dumb compilers quiet */
465-
427+
return system(cmd);
466428
#else /* WIN32 */
467429

468430
/*
469-
* As with the Unix case, it's easiest to use the shell (CMD.EXE) to
470-
* handle redirection etc. Unfortunately CMD.EXE lacks any equivalent of
471-
* "exec", so we don't get to find out the postmaster's PID immediately.
431+
* On win32 we don't use system(). So we don't need to use & (which would
432+
* be START /B on win32). However, we still call the shell (CMD.EXE) with
433+
* it to handle redirection etc.
472434
*/
473435
PROCESS_INFORMATION pi;
474436

@@ -480,15 +442,10 @@ start_postmaster(void)
480442
exec_path, pgdata_opt, post_opts, DEVNULL);
481443

482444
if (!CreateRestrictedProcess(cmd, &pi, false))
483-
{
484-
write_stderr(_("%s: could not start server: error code %lu\n"),
485-
progname, (unsigned long) GetLastError());
486-
exit(1);
487-
}
488-
/* Don't close command process handle here; caller must do so */
489-
postmasterProcess = pi.hProcess;
445+
return GetLastError();
446+
CloseHandle(pi.hProcess);
490447
CloseHandle(pi.hThread);
491-
return pi.dwProcessId; /* Shell's PID, not postmaster's! */
448+
return 0;
492449
#endif /* WIN32 */
493450
}
494451

@@ -497,21 +454,15 @@ start_postmaster(void)
497454
/*
498455
* Find the pgport and try a connection
499456
*
500-
* On Unix, pm_pid is the PID of the just-launched postmaster. On Windows,
501-
* it may be the PID of an ancestor shell process, so we can't check the
502-
* contents of postmaster.pid quite as carefully.
503-
*
504-
* On Windows, the static variable postmasterProcess is an implicit argument
505-
* to this routine; it contains a handle to the postmaster process or an
506-
* ancestor shell process thereof.
507-
*
508457
* Note that the checkpoint parameter enables a Windows service control
509458
* manager checkpoint, it's got nothing to do with database checkpoints!!
510459
*/
511460
static PGPing
512-
test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
461+
test_postmaster_connection(bool do_checkpoint)
513462
{
514463
PGPing ret = PQPING_NO_RESPONSE;
464+
bool found_stale_pidfile = false;
465+
pgpid_t pm_pid = 0;
515466
char connstr[MAXPGPATH * 2 + 256];
516467
int i;
517468

@@ -566,27 +517,29 @@ test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
566517
optlines[5] != NULL)
567518
{
568519
/* File is complete enough for us, parse it */
569-
pgpid_t pmpid;
520+
long pmpid;
570521
time_t pmstart;
571522

572523
/*
573-
* Make sanity checks. If it's for the wrong PID, or the
574-
* recorded start time is before pg_ctl started, then
575-
* either we are looking at the wrong data directory, or
576-
* this is a pre-existing pidfile that hasn't (yet?) been
577-
* overwritten by our child postmaster. Allow 2 seconds
578-
* slop for possible cross-process clock skew.
524+
* Make sanity checks. If it's for a standalone backend
525+
* (negative PID), or the recorded start time is before
526+
* pg_ctl started, then either we are looking at the wrong
527+
* data directory, or this is a pre-existing pidfile that
528+
* hasn't (yet?) been overwritten by our child postmaster.
529+
* Allow 2 seconds slop for possible cross-process clock
530+
* skew.
579531
*/
580532
pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]);
581533
pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]);
582-
if (pmstart >= start_time - 2 &&
583-
#ifndef WIN32
584-
pmpid == pm_pid
585-
#else
586-
/* Windows can only reject standalone-backend PIDs */
587-
pmpid > 0
588-
#endif
589-
)
534+
if (pmpid <= 0 || pmstart < start_time - 2)
535+
{
536+
/*
537+
* Set flag to report stale pidfile if it doesn't get
538+
* overwritten before we give up waiting.
539+
*/
540+
found_stale_pidfile = true;
541+
}
542+
else
590543
{
591544
/*
592545
* OK, seems to be a valid pidfile from our child.
@@ -596,6 +549,9 @@ test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
596549
char *hostaddr;
597550
char host_str[MAXPGPATH];
598551

552+
found_stale_pidfile = false;
553+
pm_pid = (pgpid_t) pmpid;
554+
599555
/*
600556
* Extract port number and host string to use. Prefer
601557
* using Unix socket if available.
@@ -667,23 +623,37 @@ test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
667623
}
668624

669625
/*
670-
* Check whether the child postmaster process is still alive. This
671-
* lets us exit early if the postmaster fails during startup.
672-
*
673-
* On Windows, we may be checking the postmaster's parent shell, but
674-
* that's fine for this purpose.
626+
* The postmaster should create postmaster.pid very soon after being
627+
* started. If it's not there after we've waited 5 or more seconds,
628+
* assume startup failed and give up waiting. (Note this covers both
629+
* cases where the pidfile was never created, and where it was created
630+
* and then removed during postmaster exit.) Also, if there *is* a
631+
* file there but it appears stale, issue a suitable warning and give
632+
* up waiting.
675633
*/
676-
#ifndef WIN32
634+
if (i >= 5)
677635
{
678-
int exitstatus;
636+
struct stat statbuf;
637+
638+
if (stat(pid_file, &statbuf) != 0)
639+
return PQPING_NO_RESPONSE;
679640

680-
if (waitpid((pid_t) pm_pid, &exitstatus, WNOHANG) == (pid_t) pm_pid)
641+
if (found_stale_pidfile)
642+
{
643+
write_stderr(_("\n%s: this data directory appears to be running a pre-existing postmaster\n"),
644+
progname);
681645
return PQPING_NO_RESPONSE;
646+
}
682647
}
683-
#else
684-
if (WaitForSingleObject(postmasterProcess, 0) == WAIT_OBJECT_0)
648+
649+
/*
650+
* If we've been able to identify the child postmaster's PID, check
651+
* the process is still alive. This covers cases where the postmaster
652+
* successfully created the pidfile but then crashed without removing
653+
* it.
654+
*/
655+
if (pm_pid > 0 && !postmaster_is_alive((pid_t) pm_pid))
685656
return PQPING_NO_RESPONSE;
686-
#endif
687657

688658
/* No response, or startup still in process; wait */
689659
#if defined(WIN32)
@@ -846,7 +816,7 @@ static void
846816
do_start(void)
847817
{
848818
pgpid_t old_pid = 0;
849-
pgpid_t pm_pid;
819+
int exitcode;
850820

851821
if (ctl_command != RESTART_COMMAND)
852822
{
@@ -886,13 +856,19 @@ do_start(void)
886856
}
887857
#endif
888858

889-
pm_pid = start_postmaster();
859+
exitcode = start_postmaster();
860+
if (exitcode != 0)
861+
{
862+
write_stderr(_("%s: could not start server: exit code was %d\n"),
863+
progname, exitcode);
864+
exit(1);
865+
}
890866

891867
if (do_wait)
892868
{
893869
print_msg(_("waiting for server to start..."));
894870

895-
switch (test_postmaster_connection(pm_pid, false))
871+
switch (test_postmaster_connection(false))
896872
{
897873
case PQPING_OK:
898874
print_msg(_(" done\n"));
@@ -918,12 +894,6 @@ do_start(void)
918894
}
919895
else
920896
print_msg(_("server starting\n"));
921-
922-
#ifdef WIN32
923-
/* Now we don't need the handle to the shell process anymore */
924-
CloseHandle(postmasterProcess);
925-
postmasterProcess = INVALID_HANDLE_VALUE;
926-
#endif
927897
}
928898

929899

@@ -1528,7 +1498,7 @@ pgwin32_ServiceMain(DWORD argc, LPTSTR *argv)
15281498
if (do_wait)
15291499
{
15301500
write_eventlog(EVENTLOG_INFORMATION_TYPE, _("Waiting for server startup...\n"));
1531-
if (test_postmaster_connection(postmasterPID, true) != PQPING_OK)
1501+
if (test_postmaster_connection(true) != PQPING_OK)
15321502
{
15331503
write_eventlog(EVENTLOG_ERROR_TYPE, _("Timed out waiting for server startup\n"));
15341504
pgwin32_SetServiceStatus(SERVICE_STOPPED);
@@ -1555,9 +1525,10 @@ pgwin32_ServiceMain(DWORD argc, LPTSTR *argv)
15551525
{
15561526
/*
15571527
* status.dwCheckPoint can be incremented by
1558-
* test_postmaster_connection(), so it might not start from 0.
1528+
* test_postmaster_connection(true), so it might not
1529+
* start from 0.
15591530
*/
1560-
int maxShutdownCheckPoint = status.dwCheckPoint + 12;
1531+
int maxShutdownCheckPoint = status.dwCheckPoint + 12;;
15611532

15621533
kill(postmasterPID, SIGINT);
15631534

0 commit comments

Comments
 (0)
0