Add test case for obsoleting slot with active walsender · postgrespro/postgres@0912698 · GitHub
[go: up one dir, main page]

Skip to content
  • Commit 0912698

    Browse files
    committed
    Add test case for obsoleting slot with active walsender
    The code to signal a running walsender when its reserved WAL size grows too large is completely uncovered before this commit; this adds coverage for that case. This test involves sending SIGSTOP to walsender and walreceiver and running a checkpoint while advancing WAL, then sending SIGCONT. There's no precedent for this coding in Perl tests, and my reading of relevant manpages says it's likely to fail on Windows. Because of this, this test is always skipped on that platform. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/202106102202.mjw4huiix7lo@alvherre.pgsql
    1 parent d21fca0 commit 0912698

    File tree

    1 file changed

    +82
    -3
    lines changed

    1 file changed

    +82
    -3
    lines changed

    src/test/recovery/t/019_replslot_limit.pl

    Lines changed: 82 additions & 3 deletions
    Original file line numberDiff line numberDiff line change
    @@ -11,7 +11,7 @@
    1111
    use PostgresNode;
    1212

    1313
    use File::Path qw(rmtree);
    14-
    use Test::More tests => 14;
    14+
    use Test::More tests => $TestLib::windows_os ? 14 : 18;
    1515
    use Time::HiRes qw(usleep);
    1616

    1717
    $ENV{PGDATABASE} = 'postgres';
    @@ -211,8 +211,8 @@
    211211
    }
    212212
    ok($failed, 'check that replication has been broken');
    213213

    214-
    $node_primary->stop('immediate');
    215-
    $node_standby->stop('immediate');
    214+
    $node_primary->stop;
    215+
    $node_standby->stop;
    216216

    217217
    my $node_primary2 = get_new_node('primary2');
    218218
    $node_primary2->init(allows_streaming => 1);
    @@ -253,6 +253,85 @@
    253253
    timeout => '60'));
    254254
    is($result[1], 'finished', 'check if checkpoint command is not blocked');
    255255

    256+
    $node_primary2->stop;
    257+
    $node_standby->stop;
    258+
    259+
    # The next test depends on Perl's `kill`, which apparently is not
    260+
    # portable to Windows. (It would be nice to use Test::More's `subtest`,
    261+
    # but that's not in the ancient version we require.)
    262+
    if ($TestLib::windows_os)
    263+
    {
    264+
    done_testing();
    265+
    exit;
    266+
    }
    267+
    268+
    # Get a slot terminated while the walsender is active
    269+
    # We do this by sending SIGSTOP to the walsender. Skip this on Windows.
    270+
    my $node_primary3 = get_new_node('primary3');
    271+
    $node_primary3->init(allows_streaming => 1, extra => ['--wal-segsize=1']);
    272+
    $node_primary3->append_conf(
    273+
    'postgresql.conf', qq(
    274+
    min_wal_size = 2MB
    275+
    max_wal_size = 2MB
    276+
    log_checkpoints = yes
    277+
    max_slot_wal_keep_size = 1MB
    278+
    ));
    279+
    $node_primary3->start;
    280+
    $node_primary3->safe_psql('postgres',
    281+
    "SELECT pg_create_physical_replication_slot('rep3')");
    282+
    # Take backup
    283+
    $backup_name = 'my_backup';
    284+
    $node_primary3->backup($backup_name);
    285+
    # Create standby
    286+
    my $node_standby3 = get_new_node('standby_3');
    287+
    $node_standby3->init_from_backup($node_primary3, $backup_name,
    288+
    has_streaming => 1);
    289+
    $node_standby3->append_conf('postgresql.conf', "primary_slot_name = 'rep3'");
    290+
    $node_standby3->start;
    291+
    $node_primary3->wait_for_catchup($node_standby3->name, 'replay');
    292+
    my $senderpid = $node_primary3->safe_psql('postgres',
    293+
    "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
    294+
    like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
    295+
    my $receiverpid = $node_standby3->safe_psql('postgres',
    296+
    "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
    297+
    like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");
    298+
    299+
    # freeze walsender and walreceiver. Slot will still be active, but walreceiver
    300+
    # won't get anything anymore.
    301+
    kill 'STOP', $senderpid, $receiverpid;
    302+
    $logstart = get_log_size($node_primary3);
    303+
    advance_wal($node_primary3, 4);
    304+
    ok(find_in_log($node_primary3, "to release replication slot", $logstart),
    305+
    "walreceiver termination logged");
    306+
    307+
    # Now let the walsender continue; slot should be killed now.
    308+
    # (Must not let walreceiver run yet; otherwise the standby could start another
    309+
    # one before the slot can be killed)
    310+
    kill 'CONT', $senderpid;
    311+
    $node_primary3->poll_query_until('postgres',
    312+
    "SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep3'",
    313+
    "lost")
    314+
    or die "timed out waiting for slot to be lost";
    315+
    316+
    my $max_attempts = 180;
    317+
    while ($max_attempts-- > 0)
    318+
    {
    319+
    if (find_in_log(
    320+
    $node_primary3,
    321+
    'invalidating slot "rep3" because its restart_lsn', $logstart))
    322+
    {
    323+
    ok(1, "slot invalidation logged");
    324+
    last;
    325+
    }
    326+
    sleep 1;
    327+
    }
    328+
    329+
    # Now let the walreceiver continue, so that the node can be stopped cleanly
    330+
    kill 'CONT', $receiverpid;
    331+
    332+
    $node_primary3->stop;
    333+
    $node_standby3->stop;
    334+
    256335
    #####################################
    257336
    # Advance WAL of $node by $n segments
    258337
    sub advance_wal

    0 commit comments

    Comments
     (0)
    0