8000 Fix pg_rewind bugs when rewinding a standby server. · postgrespro/postgres@2b4f313 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2b4f313

Browse files
committed
Fix pg_rewind bugs when rewinding a standby server.
If the target is a standby server, its WAL doesn't end at the last checkpoint record, but at minRecoveryPoint. We must scan all the WAL from the last common checkpoint all the way up to minRecoveryPoint for modified pages, and also consider that portion when determining whether the server needs rewinding. Backpatch to all supported versions. Author: Ian Barwick and me Discussion: https://www.postgresql.org/message-id/CABvVfJU-LDWvoz4-Yow3Ay5LZYTuPD7eSjjE4kGyNZpXC6FrVQ%40mail.gmail.com
1 parent 6114040 commit 2b4f313

File tree

3 files changed

+200
-23
lines changed
  • t
  • 3 files changed

    +200
    -23
    lines changed

    src/bin/pg_rewind/parsexlog.c

    Lines changed: 10 additions & 1 deletion
    Original file line numberDiff line numberDiff line change
    @@ -55,6 +55,9 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader,
    5555
    * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline
    5656
    * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of
    5757
    * the data blocks touched by the WAL records, and return them in a page map.
    58+
    *
    59+
    * 'endpoint' is the end of the last record to read. The record starting at
    60+
    * 'endpoint' is the first one that is not read.
    5861
    */
    5962
    void
    6063
    extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
    @@ -93,7 +96,13 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
    9396

    9497
    extractPageInfo(xlogreader);
    9598

    96-
    } while (xlogreader->ReadRecPtr != endpoint);
    99+
    } while (xlogreader->EndRecPtr < endpoint);
    100+
    101+
    /*
    102+
    * If 'endpoint' didn't point exactly at a record boundary, the caller
    103+
    * messed up.
    104+
    */
    105+
    Assert(xlogreader->EndRecPtr == endpoint);
    97106

    98107
    XLogReaderFree(xlogreader);
    99108
    if (xlogreadfd != -1)

    src/bin/pg_rewind/pg_rewind.c

    Lines changed: 34 additions & 22 deletions
    Original file line numberDiff line numberDiff line change
    @@ -129,6 +129,7 @@ main(int argc, char **argv)
    129129
    XLogRecPtr chkptrec;
    130130
    TimeLineID chkpttli;
    131131
    XLogRecPtr chkptredo;
    132+
    XLogRecPtr target_wal_endrec;
    132133
    size_t size;
    133134
    char *buffer;
    134135
    bool no_ensure_shutdown = false;
    @@ -335,43 +336,56 @@ main(int argc, char **argv)
    335336
    {
    336337
    pg_log_info("source and target cluster are on the same timeline");
    337338
    rewind_needed = false;
    339+
    target_wal_endrec = 0;
    338340
    }
    339341
    else
    340342
    {
    343+
    XLogRecPtr chkptendrec;
    344+
    341345
    findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
    342346
    pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
    343347
    (uint32) (divergerec >> 32), (uint32) divergerec,
    344348
    targetHistory[lastcommontliIndex].tli);
    345349

    350+
    /*
    351+
    * Determine the end-of-WAL on the target.
    352+
    *
    353+
    * The WAL ends at the last shutdown checkpoint, or at
    354+
    * minRecoveryPoint if it was a standby. (If we supported rewinding a
    355+
    * server that was not shut down cleanly, we would need to replay
    356+
    * until we reach the first invalid record, like crash recovery does.)
    357+
    */
    358+
    359+
    /* read the checkpoint record on the target to see where it ends. */
    360+
    chkptendrec = readOneRecord(datadir_target,
    361+
    ControlFile_target.checkPoint,
    362+
    targetNentries - 1,
    363+
    restore_command);
    364+
    365+
    if (ControlFile_target.minRecoveryPoint > chkptendrec)
    366+
    {
    367+
    target_wal_endrec = ControlFile_target.minRecoveryPoint;
    368+
    }
    369+
    else
    370+
    {
    371+
    target_wal_endrec = chkptendrec;
    372+
    }
    373+
    346374
    /*
    347375
    * Check for the possibility that the target is in fact a direct
    348376
    * ancestor of the source. In that case, there is no divergent history
    349377
    * in the target that needs rewinding.
    350378
    */
    351-
    if (ControlFile_target.checkPoint >= divergerec)
    379+
    if (target_wal_endrec > divergerec)
    352380
    {
    353381
    rewind_needed = true;
    354382
    }
    355383
    else
    356384
    {
    357-
    XLogRecPtr chkptendrec;
    385+
    /* the last common checkpoint record must be part of target WAL */
    386+
    Assert(target_wal_endrec == divergerec);
    358387

    359-
    /* Read the checkpoint record on the target to see where it ends. */
    360-
    chkptendrec = readOneRecord(datadir_target,
    361-
    ControlFile_target.checkPoint,
    362-
    targetNentries - 1,
    363-
    restore_command);
    364-
    365-
    /*
    366-
    * If the histories diverged exactly at the end of the shutdown
    367-
    * checkpoint record on the target, there are no WAL records in
    368-
    * the target that don't belong in the source's history, and no
    369-
    * rewind is needed.
    370-
    */
    371-
    if (chkptendrec == divergerec)
    372-
    rewind_needed = false;
    373-
    else
    374-
    rewind_needed = true;
    388+
    rewind_needed = false;
    375389
    }
    376390
    }
    377391

    @@ -407,14 +421,12 @@ main(int argc, char **argv)
    407421
    /*
    408422
    * Read the target WAL from last checkpoint before the point of fork, to
    409423
    * extract all the pages that were modified on the target cluster after
    410-
    * the fork. We can stop reading after reaching the final shutdown record.
    411-
    * XXX: If we supported rewinding a server that was not shut down cleanly,
    412-
    * we would need to replay until the end of WAL here.
    424+
    * the fork.
    413425
    */
    414426
    if (showprogress)
    415427
    pg_log_info("reading WAL in target");
    416428
    extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
    417-
    ControlFile_target.checkPoint, restore_command);
    429+
    target_wal_endrec, restore_command);
    418430

    419431
    /*
    420432
    * We have collected all information we need from both systems. Decide
    Lines changed: 156 additions & 0 deletions
    Original file line numberDiff line numberDiff line change
    @@ -0,0 +1,156 @@
    1+
    #
    2+
    # Test situation where a target data directory contains
    3+
    # WAL records beyond both the last checkpoint and the divergence
    4+
    # point:
    5+
    #
    6+
    # Target WAL (TLI 2):
    7+
    #
    8+
    # backup ... Checkpoint A ... INSERT 'rewind this'
    9+
    # (TLI 1 -> 2)
    10+
    #
    11+
    # ^ last common ^ minRecoveryPoint
    12+
    # checkpoint
    13+
    #
    14+
    # Source WAL (TLI 3):
    15+
    #
    16+
    # backup ... Checkpoint A ... Checkpoint B ... INSERT 'keep this'
    17+
    # (TLI 1 -> 2) (TLI 2 -> 3)
    18+
    #
    19+
    #
    20+
    # The last common checkpoint is Checkpoint A. But there is WAL on TLI 2
    21+
    # after the last common checkpoint that needs to be rewound. We used to
    22+
    # have a bug where minRecoveryPoint was ignored, and pg_rewind concluded
    23+
    # that the target doesn't need rewinding in this scenario, because the
    24+
    # last checkpoint on the target TLI was an ancestor of the source TLI.
    25+
    #
    26+
    #
    27+
    # This test does not make use of RewindTest as it requires three
    28+
    # nodes.
    29+
    30+
    use strict;
    31+
    use warnings;
    32+
    use PostgresNode;
    33+
    use TestLib;
    34+
    use Test::More tests => 3;
    35+
    36+
    use File::Copy;
    37+
    38+
    my $tmp_folder = TestLib::tempdir;
    39+
    40+
    my $node_1 = get_new_node('node_1');
    41+
    $node_1->init(allows_streaming => 1);
    42+
    $node_1->append_conf('postgresql.conf', qq(
    43+
    wal_keep_size='100 MB'
    44+
    ));
    45+
    46+
    $node_1->start;
    47+
    48+
    # Create a couple of test tables
    49+
    $node_1->safe_psql('postgres', 'CREATE TABLE public.foo (t TEXT)');
    50+
    $node_1->safe_psql('postgres', 'CREATE TABLE public.bar (t TEXT)');
    51+
    $node_1->safe_psql('postgres', "INSERT INTO public.bar VALUES ('in both')");
    52+
    53+
    54+
    # Take backup
    55+
    my $backup_name = 'my_backup';
    56+
    $node_1->backup($backup_name);
    57+
    58+
    # Create streaming standby from backup
    59+
    my $node_2 = get_new_node('node_2');
    60+
    $node_2->init_from_backup($node_1, $backup_name,
    61+
    has_streaming => 1);
    62+
    $node_2->start;
    63+
    64+
    # Create streaming standby from backup
    65+
    my $node_3 = get_new_node('node_3');
    66+
    $node_3->init_from_backup($node_1, $backup_name,
    67+
    has_streaming => 1);
    68+
    $node_3->start;
    69+
    70+
    # Stop node_1
    71+
    72+
    $node_1->stop('fast');
    73+
    74+
    # Promote node_3
    75+
    $node_3->promote;
    76+
    77+
    # node_1 rejoins node_3
    78+
    79+
    my $node_3_connstr = $node_3->connstr;
    80+
    81+
    $node_1->append_conf('postgresql.conf', qq(
    82+
    primary_conninfo='$node_3_connstr'
    83+
    ));
    84+
    $node_1->set_standby_mode();
    85+
    $node_1->start();
    86+
    87+
    # node_2 follows node_3
    88+
    89+
    $node_2->append_conf('postgresql.conf', qq(
    90+
    primary_conninfo='$node_3_connstr'
    91+
    ));
    92+
    $node_2->restart();
    93+
    94+
    # Promote node_1
    95+
    96+
    $node_1->promote;
    97+
    98+
    # We now have a split-brain with two primaries. Insert a row on both to
    99+
    # demonstratively create a split brain. After the rewind, we should only
    100+
    # see the insert on 1, as the insert on node 3 is rewound away.
    101+
    $node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('keep this')");
    102+
    103+
    # Insert more rows in node 1, to bump up the XID counter. Otherwise, if
    104+
    # rewind doesn't correctly rewind the changes made on the other node,
    105+
    # we might fail to notice if the inserts are invisible because the XIDs
    106+
    # are not marked as committed.
    107+
    $node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this')");
    108+
    $node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this too')");
    109+
    110+
    # Also insert a row in 'bar' on node 3. It is unmodified in node 1, so it won't get
    111+
    # overwritten by replaying the WAL from node 1.
    112+
    $node_3->safe_psql('postgres', "INSERT INTO public.bar (t) VALUES ('rewind this')");
    113+
    114+
    # Wait for node 2 to catch up
    115+
    $node_2->poll_query_until('postgres',
    116+
    q|SELECT COUNT(*) > 1 FROM public.bar|, 't');
    117+
    118+
    # At this point node_2 will shut down without a shutdown checkpoint,
    119+
    # but with WAL entries beyond the preceding shutdown checkpoint.
    120+
    $node_2->stop('fast');
    121+
    $node_3->stop('fast');
    122+
    123+
    my $node_2_pgdata = $node_2->data_dir;
    124+
    my $node_1_connstr = $node_1->connstr;
    125+
    126+
    # Keep a temporary postgresql.conf or it would be overwritten during the rewind.
    127+
    copy(
    128+
    "$node_2_pgdata/postgresql.conf",
    129+
    "$tmp_folder/node_2-postgresql.conf.tmp");
    130+
    131+
    command_ok(
    132+
    [
    133+
    'pg_rewind',
    134+
    "--source-server=$node_1_connstr",
    135+
    "--target-pgdata=$node_2_pgdata"
    136+
    ],
    137+
    'pg_rewind detects rewind needed');
    138+
    139+
    # Now move back postgresql.conf with old settings
    140+
    move(
    141+
    "$tmp_folder/node_2-postgresql.conf.tmp",
    142+
    "$node_2_pgdata/postgresql.conf");
    143+
    144+
    $node_2->start;
    145+
    146+
    # Check contents of the test tables after rewind. The rows inserted in node 3
    147+
    # before rewind should've been overwritten with the data from node 1.
    148+
    my $result;
    149+
    $result = $node_2->safe_psql('postgres', 'checkpoint');
    150+
    $result = $node_2->safe_psql('postgres', 'SELECT * FROM public.foo');
    151+
    is($result, qq(keep this
    152+
    and this
    153+
    and this too), 'table foo after rewind');
    154+
    155+
    $result = $node_2->safe_psql('postgres', 'SELECT * FROM public.bar');
    156+
    is($result, qq(in both), 'table bar after rewind');

    0 commit comments

    Comments
     (0)
    0