@@ -83,6 +83,12 @@ static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeo
8383static bool sendFile (bbsink * sink , const char * readfilename , const char * tarfilename ,
8484 struct stat * statbuf , bool missing_ok , Oid dboid ,
8585 backup_manifest_info * manifest , const char * spcoid );
86+ static off_t read_file_data_into_buffer (bbsink * sink ,
87+ const char * readfilename , int fd ,
88+ off_t offset , size_t length ,
89+ BlockNumber blkno ,
90+ bool verify_checksum ,
91+ int * checksum_failures );
8692static bool verify_page_checksum (Page page , XLogRecPtr start_lsn ,
8793 BlockNumber blkno ,
8894 uint16 * expected_checksum );
@@ -1490,9 +1496,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
14901496 BlockNumber blkno = 0 ;
14911497 int checksum_failures = 0 ;
14921498 off_t cnt ;
1493- int i ;
1494- pgoff_t len = 0 ;
1495- char * page ;
1499+ pgoff_t bytes_done = 0 ;
14961500 int segmentno = 0 ;
14971501 char * segmentpath ;
14981502 bool verify_checksum = false;
@@ -1514,6 +1518,12 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
15141518
15151519 _tarWriteHeader (sink , tarfilename , NULL , statbuf , false);
15161520
1521+ /*
1522+ * Checksums are verified in multiples of BLCKSZ, so the buffer length
1523+ * should be a multiple of the block size as well.
1524+ */
1525+ Assert ((sink -> bbs_buffer_length % BLCKSZ ) == 0 );
1526+
15171527 if (!noverify_checksums && DataChecksumsEnabled ())
15181528 {
15191529 char * filename ;
@@ -1551,23 +1561,21 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
15511561 * for a base backup we can ignore such extended data. It will be restored
15521562 * from WAL.
15531563 */
1554- while (len < statbuf -> st_size )
1564+ while (bytes_done < statbuf -> st_size )
15551565 {
1556- size_t remaining = statbuf -> st_size - len ;
1566+ size_t remaining = statbuf -> st_size - bytes_done ;
15571567
15581568 /* Try to read some more data. */
1559- cnt = basebackup_read_file (fd , sink -> bbs_buffer ,
1560- Min (sink -> bbs_buffer_length , remaining ),
1561- len , readfilename , true);
1569+ cnt = read_file_data_into_buffer (sink , readfilename , fd , bytes_done ,
1570+ remaining ,
1571+ blkno + segmentno * RELSEG_SIZE ,
1572+ verify_checksum ,
1573+ & checksum_failures );
15621574
15631575 /*
1564- * The checksums are verified at block level, so we iterate over the
1565- * buffer in chunks of BLCKSZ, after making sure that
1566- * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of
1567- * BLCKSZ bytes.
1576+ * If the amount of data we were able to read was not a multiple of
1577+ * BLCKSZ, we cannot verify checksums, which are block-level.
15681578 */
1569- Assert ((sink -> bbs_buffer_length % BLCKSZ ) == 0 );
1570-
15711579 if (verify_checksum && (cnt % BLCKSZ != 0 ))
15721580 {
15731581 ereport (WARNING ,
@@ -1578,84 +1586,6 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
15781586 verify_checksum = false;
15791587 }
15801588
1581- if (verify_checksum )
1582- {
1583- for (i = 0 ; i < cnt / BLCKSZ ; i ++ )
1584- {
1585- int reread_cnt ;
1586- uint16 expected_checksum ;
1587-
1588- page = sink -> bbs_buffer + BLCKSZ * i ;
1589-
1590- /* If the page is OK, go on to the next one. */
1591- if (verify_page_checksum (page , sink -> bbs_state -> startptr ,
1592- blkno + i + segmentno * RELSEG_SIZE ,
1593- & expected_checksum ))
1594- continue ;
1595-
1596- /*
1597- * Retry the block on the first failure. It's possible that
1598- * we read the first 4K page of the block just before postgres
1599- * updated the entire block so it ends up looking torn to us.
1600- * If, before we retry the read, the concurrent write of the
1601- * block finishes, the page LSN will be updated and we'll
1602- * realize that we should ignore this block.
1603- *
1604- * There's no guarantee that this will actually happen,
1605- * though: the torn write could take an arbitrarily long time
1606- * to complete. Retrying multiple times wouldn't fix this
1607- * problem, either, though it would reduce the chances of it
1608- * happening in practice. The only real fix here seems to be
1609- * to have some kind of interlock that allows us to wait until
1610- * we can be certain that no write to the block is in
1611- * progress. Since we don't have any such thing right now, we
1612- * just do this and hope for the best.
1613- */
1614- reread_cnt =
1615- basebackup_read_file (fd ,
1616- sink -> bbs_buffer + BLCKSZ * i ,
1617- BLCKSZ , len + BLCKSZ * i ,
1618- readfilename ,
1619- false);
1620- if (reread_cnt == 0 )
1621- {
1622- /*
1623- * If we hit end-of-file, a concurrent truncation must
1624- * have occurred, so break out of this loop just as if the
1625- * initial fread() returned 0. We'll drop through to the
1626- * same code that handles that case. (We must fix up cnt
1627- * first, though.)
1628- */
1629- cnt = BLCKSZ * i ;
1630- break ;
1631- }
1632-
1633- /* If the page now looks OK, go on to the next one. */
1634- if (verify_page_checksum (page , sink -> bbs_state -> startptr ,
1635- blkno + i + segmentno * RELSEG_SIZE ,
1636- & expected_checksum ))
1637- continue ;
1638-
1639- /* Handle checksum failure. */
1640- checksum_failures ++ ;
1641- if (checksum_failures <= 5 )
1642- ereport (WARNING ,
1643- (errmsg ("checksum verification failed in "
1644- "file \"%s\", block %u: calculated "
1645- "%X but expected %X" ,
1646- readfilename , blkno + i , expected_checksum ,
1647- ((PageHeader ) page )-> pd_checksum )));
1648- if (checksum_failures == 5 )
1649- ereport (WARNING ,
1650- (errmsg ("further checksum verification "
1651- "failures in file \"%s\" will not "
1652- "be reported" , readfilename )));
1653- }
1654-
1655- /* Update block number for next pass through the outer loop. */
1656- blkno += i ;
1657- }
1658-
16591589 /*
16601590 * If we hit end-of-file, a concurrent truncation must have occurred.
16611591 * That's not an error condition, because WAL replay will fix things
@@ -1664,21 +1594,23 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
16641594 if (cnt == 0 )
16651595 break ;
16661596
1597+ /* Update block number and # of bytes done for next loop iteration. */
1598+ blkno += cnt / BLCKSZ ;
1599+ bytes_done += cnt ;
1600+
16671601 /* Archive the data we just read. */
16681602 bbsink_archive_contents (sink , cnt );
16691603
16701604 /* Also feed it to the checksum machinery. */
16711605 if (pg_checksum_update (& checksum_ctx ,
16721606 (uint8 * ) sink -> bbs_buffer , cnt ) < 0 )
16731607 elog (ERROR , "could not update checksum of base backup" );
1674-
1675- len += cnt ;
16761608 }
16771609
16781610 /* If the file was truncated while we were sending it, pad it with zeros */
1679- while (len < statbuf -> st_size )
1611+ while (bytes_done < statbuf -> st_size )
16801612 {
1681- size_t remaining = statbuf -> st_size - len ;
1613+ size_t remaining = statbuf -> st_size - bytes_done ;
16821614 size_t nbytes = Min (sink -> bbs_buffer_length , remaining );
16831615
16841616 MemSet (sink -> bbs_buffer , 0 , nbytes );
@@ -1687,15 +1619,15 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
16871619 nbytes ) < 0 )
16881620 elog (ERROR , "could not update checksum of base backup" );
16891621 bbsink_archive_contents (sink , nbytes );
1690- len += nbytes ;
1622+ bytes_done += nbytes ;
16911623 }
16921624
16931625 /*
16941626 * Pad to a block boundary, per tar format requirements. (This small piece
16951627 * of data is probably not worth throttling, and is not checksummed
16961628 * because it's not actually part of the file.)
16971629 */
1698- _tarWritePadding (sink , len );
1630+ _tarWritePadding (sink , bytes_done );
16991631
17001632 CloseTransientFile (fd );
17011633
@@ -1718,6 +1650,109 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
17181650 return true;
17191651}
17201652
1653+ /*
1654+ * Read some more data from the file into the bbsink's buffer, verifying
1655+ * checksums as required.
1656+ *
1657+ * 'offset' is the file offset from which we should begin to read, and
1658+ * 'length' is the amount of data that should be read. The actual amount
1659+ * of data read will be less than the requested amount if the bbsink's
1660+ * buffer isn't big enough to hold it all, or if the underlying file has
1661+ * been truncated. The return value is the number of bytes actually read.
1662+ *
1663+ * 'blkno' is the block number of the first page in the bbsink's buffer
1664+ * relative to the start of the relation.
1665+ *
1666+ * 'verify_checksum' indicates whether we should try to verify checksums
1667+ * for the blocks we read. If we do this, we'll update *checksum_failures
1668+ * and issue warnings as appropriate.
1669+ */
1670+ static off_t
1671+ read_file_data_into_buffer (bbsink * sink , const char * readfilename , int fd ,
1672+ off_t offset , size_t length , BlockNumber blkno ,
1673+ bool verify_checksum , int * checksum_failures )
1674+ {
1675+ off_t cnt ;
1676+ int i ;
1677+ char * page ;
1678+
1679+ /* Try to read some more data. */
1680+ cnt = basebackup_read_file (fd , sink -> bbs_buffer ,
1681+ Min (sink -> bbs_buffer_length , length ),
1682+ offset , readfilename , true);
1683+
1684+ /* Can't verify checksums if read length is not a multiple of BLCKSZ. */
1685+ if (!verify_checksum || (cnt % BLCKSZ ) != 0 )
1686+ return cnt ;
1687+
1688+ /* Verify checksum for each block. */
1689+ for (i = 0 ; i < cnt / BLCKSZ ; i ++ )
1690+ {
1691+ int reread_cnt ;
1692+ uint16 expected_checksum ;
1693+
1694+ page = sink -> bbs_buffer + BLCKSZ * i ;
1695+
1696+ /* If the page is OK, go on to the next one. */
1697+ if (verify_page_checksum (page , sink -> bbs_state -> startptr , blkno + i ,
1698+ & expected_checksum ))
1699+ continue ;
1700+
1701+ /*
1702+ * Retry the block on the first failure. It's possible that we read
1703+ * the first 4K page of the block just before postgres updated the
1704+ * entire block so it ends up looking torn to us. If, before we retry
1705+ * the read, the concurrent write of the block finishes, the page LSN
1706+ * will be updated and we'll realize that we should ignore this block.
1707+ *
1708+ * There's no guarantee that this will actually happen, though: the
1709+ * torn write could take an arbitrarily long time to complete.
1710+ * Retrying multiple times wouldn't fix this problem, either, though
1711+ * it would reduce the
CDB3
chances of it happening in practice. The only
1712+ * real fix here seems to be to have some kind of interlock that
1713+ * allows us to wait until we can be certain that no write to the
1714+ * block is in progress. Since we don't have any such thing right now,
1715+ * we just do this and hope for the best.
1716+ */
1717+ reread_cnt =
1718+ basebackup_read_file (fd , sink -> bbs_buffer + BLCKSZ * i ,
1719+ BLCKSZ , offset + BLCKSZ * i ,
1720+ readfilename , false);
1721+ if (reread_cnt == 0 )
1722+ {
1723+ /*
1724+ * If we hit end-of-file, a concurrent truncation must have
1725+ * occurred, so reduce cnt to reflect only the blocks already
1726+ * processed and break out of this loop.
1727+ */
1728+ cnt = BLCKSZ * i ;
1729+ break ;
1730+ }
1731+
1732+ /* If the page now looks OK, go on to the next one. */
1733+ if (verify_page_checksum (page , sink -> bbs_state -> startptr , blkno + i ,
1734+ & expected_checksum ))
1735+ continue ;
1736+
1737+ /* Handle checksum failure. */
1738+ (* checksum_failures )++ ;
1739+ if (* checksum_failures <= 5 )
1740+ ereport (WARNING ,
1741+ (errmsg ("checksum verification failed in "
1742+ "file \"%s\", block %u: calculated "
1743+ "%X but expected %X" ,
1744+ readfilename , blkno + i , expected_checksum ,
1745+ ((PageHeader ) page )-> pd_checksum )));
1746+ if (* checksum_failures == 5 )
1747+ ereport (WARNING ,
1748+ (errmsg ("further checksum verification "
1749+ "failures in file \"%s\" will not "
1750+ "be reported" , readfilename )));
1751+ }
1752+
1753+ return cnt ;
1754+ }
1755+
17211756/*
17221757 * Try to verify the checksum for the provided page, if it seems appropriate
17231758 * to do so.
0 commit comments