tableam: relation creation, VACUUM FULL/CLUSTER, SET TABLESPACE. · postgrespro/postgres@d25f519 · GitHub
[go: up one dir, main page]

Skip to content

Commit d25f519

Browse files
committed
tableam: relation creation, VACUUM FULL/CLUSTER, SET TABLESPACE.
This moves the responsibility for:
- creating the storage necessary for a relation, including creating a new relfilenode for a relation with existing storage
- non-transactional truncation of a relation
- VACUUM FULL / CLUSTER's rewrite of a table
below tableam.
This is fairly straightforward, with a bit of added complexity to move the computation of xid / multixid horizons below the AM, as they don't make sense for every table AM.
Author: Andres Freund
Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
1 parent 7e69323 commit d25f519

File tree

13 files changed

+856
-579
lines changed
  • utils
  • 13 files changed

    +856
    -579
    lines changed

    src/backend/access/heap/heapam_handler.c

    Lines changed: 451 additions & 0 deletions
    Large diffs are not rendered by default.

    src/backend/bootstrap/bootparse.y

    Lines changed: 6 additions & 1 deletion
    Original file line numberDiff line numberDiff line change
    @@ -209,6 +209,9 @@ Boot_CreateStmt:
    209209

    210210
    if ($4)
    211211
    {
    212+
    TransactionId relfrozenxid;
    213+
    MultiXactId relminmxid;
    214+
    212215
    if (boot_reldesc)
    213216
    {
    214217
    elog(DEBUG4, "create bootstrap: warning, open relation exists, closing first");
    @@ -226,7 +229,9 @@ Boot_CreateStmt:
    226229
    RELPERSISTENCE_PERMANENT,
    227230
    shared_relation,
    228231
    mapped_relation,
    229-
    true);
    232+
    true,
    233+
    &relfrozenxid,
    234+
    &relminmxid);
    230235
    elog(DEBUG4, "bootstrap relation created");
    231236
    }
    232237
    else

    src/backend/catalog/heap.c

    Lines changed: 49 additions & 71 deletions
    Original file line numberDiff line numberDiff line change
    @@ -35,6 +35,7 @@
    3535
    #include "access/relation.h"
    3636
    #include "access/sysattr.h"
    3737
    #include "access/table.h"
    38+
    #include "access/tableam.h"
    3839
    #include "access/transam.h"
    3940
    #include "access/xact.h"
    4041
    #include "access/xlog.h"
    @@ -98,6 +99,8 @@ static void AddNewRelationTuple(Relation pg_class_desc,
    9899
    Oid reloftype,
    99100
    Oid relowner,
    100101
    char relkind,
    102+
    TransactionId relfrozenxid,
    103+
    TransactionId relminmxid,
    101104
    Datum relacl,
    102105
    Datum reloptions);
    103106
    static ObjectAddress AddNewRelationType(const char *typeName,
    @@ -300,7 +303,9 @@ heap_create(const char *relname,
    300303
    char relpersistence,
    301304
    bool shared_relation,
    302305
    bool mapped_relation,
    303-
    bool allow_system_table_mods)
    306+
    bool allow_system_table_mods,
    307+
    TransactionId *relfrozenxid,
    308+
    MultiXactId *relminmxid)
    304309
    {
    305310
    bool create_storage;
    306311
    Relation rel;
    @@ -327,6 +332,9 @@ heap_create(const char *relname,
    327332
    get_namespace_name(relnamespace), relname),
    328333
    errdetail("System catalog modifications are currently disallowed.")));
    329334

    335+
    *relfrozenxid = InvalidTransactionId;
    336+
    *relminmxid = InvalidMultiXactId;
    337+
    330338
    /* Handle reltablespace for specific relkinds. */
    331339
    switch (relkind)
    332340
    {
    @@ -400,13 +408,36 @@ heap_create(const char *relname,
    400408
    /*
    401409
    * Have the storage manager create the relation's disk file, if needed.
    402410
    *
    403-
    * We only create the main fork here, other forks will be created on
    404-
    * demand.
    411+
    * For relations the callback creates both the main and the init fork, for
    412+
    * indexes only the main fork is created. The other forks will be created
    413+
    * on demand.
    405414
    */
    406415
    if (create_storage)
    407416
    {
    408417
    RelationOpenSmgr(rel);
    409-
    RelationCreateStorage(rel->rd_node, relpersistence);
    418+
    419+
    switch (rel->rd_rel->relkind)
    420+
    {
    421+
    case RELKIND_VIEW:
    422+
    case RELKIND_COMPOSITE_TYPE:
    423+
    case RELKIND_FOREIGN_TABLE:
    424+
    case RELKIND_PARTITIONED_TABLE:
    425+
    case RELKIND_PARTITIONED_INDEX:
    426+
    Assert(false);
    427+
    break;
    428+
    429+
    case RELKIND_INDEX:
    430+
    case RELKIND_SEQUENCE:
    431+
    RelationCreateStorage(rel->rd_node, relpersistence);
    432+
    break;
    433+
    434+
    case RELKIND_RELATION:
    435+
    case RELKIND_TOASTVALUE:
    436+
    case RELKIND_MATVIEW:
    437+
    table_relation_set_new_filenode(rel, relpersistence,
    438+
    relfrozenxid, relminmxid);
    439+
    break;
    440+
    }
    410441
    }
    411442

    412443
    return rel;
    @@ -892,6 +923,8 @@ AddNewRelationTuple(Relation pg_class_desc,
    892923
    Oid reloftype,
    893924
    Oid relowner,
    894925
    char relkind,
    926+
    TransactionId relfrozenxid,
    927+
    TransactionId relminmxid,
    895928
    Datum relacl,
    896929
    Datum reloptions)
    897930
    {
    @@ -928,40 +961,8 @@ AddNewRelationTuple(Relation pg_class_desc,
    928961
    break;
    929962
    }
    930963

    931-
    /* Initialize relfrozenxid and relminmxid */
    932-
    if (relkind == RELKIND_RELATION ||
    933-
    relkind == RELKIND_MATVIEW ||
    934-
    relkind == RELKIND_TOASTVALUE)
    935-
    {
    936-
    /*
    937-
    * Initialize to the minimum XID that could put tuples in the table.
    938-
    * We know that no xacts older than RecentXmin are still running, so
    939-
    * that will do.
    940-
    */
    941-
    new_rel_reltup->relfrozenxid = RecentXmin;
    942-
    943-
    /*
    944-
    * Similarly, initialize the minimum Multixact to the first value that
    945-
    * could possibly be stored in tuples in the table. Running
    946-
    * transactions could reuse values from their local cache, so we are
    947-
    * careful to consider all currently running multis.
    948-
    *
    949-
    * XXX this could be refined further, but is it worth the hassle?
    950-
    */
    951-
    new_rel_reltup->relminmxid = GetOldestMultiXactId();
    952-
    }
    953-
    else
    954-
    {
    955-
    /*
    956-
    * Other relation types will not contain XIDs, so set relfrozenxid to
    957-
    * InvalidTransactionId. (Note: a sequence does contain a tuple, but
    958-
    * we force its xmin to be FrozenTransactionId always; see
    959-
    * commands/sequence.c.)
    960-
    */
    961-
    new_rel_reltup->relfrozenxid = InvalidTransactionId;
    962-
    new_rel_reltup->relminmxid = InvalidMultiXactId;
    963-
    }
    964-
    964+
    new_rel_reltup->relfrozenxid = relfrozenxid;
    965+
    new_rel_reltup->relminmxid = relminmxid;
    965966
    new_rel_reltup->relowner = relowner;
    966967
    new_rel_reltup->reltype = new_type_oid;
    967968
    new_rel_reltup->reloftype = reloftype;
    @@ -1089,6 +1090,8 @@ heap_create_with_catalog(const char *relname,
    10891090
    Oid new_type_oid;
    10901091
    ObjectAddress new_type_addr;
    10911092
    Oid new_array_oid = InvalidOid;
    1093+
    TransactionId relfrozenxid;
    1094+
    MultiXactId relminmxid;
    10921095

    10931096
    pg_class_desc = table_open(RelationRelationId, RowExclusiveLock);
    10941097

    @@ -1220,7 +1223,9 @@ heap_create_with_catalog(const char *relname,
    12201223
    relpersistence,
    12211224
    shared_relation,
    12221225
    mapped_relation,
    1223-
    allow_system_table_mods);
    1226+
    allow_system_table_mods,
    1227+
    &relfrozenxid,
    1228+
    &relminmxid);
    12241229

    12251230
    Assert(relid == RelationGetRelid(new_rel_desc));
    12261231

    @@ -1319,6 +1324,8 @@ heap_create_with_catalog(const char *relname,
    13191324
    reloftypeid,
    13201325
    ownerid,
    13211326
    relkind,
    1327+
    relfrozenxid,
    1328+
    relminmxid,
    13221329
    PointerGetDatum(relacl),
    13231330
    reloptions);
    13241331

    @@ -1407,14 +1414,6 @@ heap_create_with_catalog(const char *relname,
    14071414
    if (oncommit != ONCOMMIT_NOOP)
    14081415
    register_on_commit_action(relid, oncommit);
    14091416

    1410-
    /*
    1411-
    * Unlogged objects need an init fork, except for partitioned tables which
    1412-
    * have no storage at all.
    1413-
    */
    1414-
    if (relpersistence == RELPERSISTENCE_UNLOGGED &&
    1415-
    relkind != RELKIND_PARTITIONED_TABLE)
    1416-
    heap_create_init_fork(new_rel_desc);
    1417-
    14181417
    /*
    14191418
    * ok, the relation has been cataloged, so close our relations and return
    14201419
    * the OID of the newly created relation.
    @@ -1425,27 +1424,6 @@ heap_create_with_catalog(const char *relname,
    14251424
    return relid;
    14261425
    }
    14271426

    1428-
    /*
    1429-
    * Set up an init fork for an unlogged table so that it can be correctly
    1430-
    * reinitialized on restart. An immediate sync is required even if the
    1431-
    * page has been logged, because the write did not go through
    1432-
    * shared_buffers and therefore a concurrent checkpoint may have moved
    1433-
    * the redo pointer past our xlog record. Recovery may as well remove it
    1434-
    * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE
    1435-
    * record. Therefore, logging is necessary even if wal_level=minimal.
    1436-
    */
    1437-
    void
    1438-
    heap_create_init_fork(Relation rel)
    1439-
    {
    1440-
    Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
    1441-
    rel->rd_rel->relkind == RELKIND_MATVIEW ||
    1442-
    rel->rd_rel->relkind == RELKIND_TOASTVALUE);
    1443-
    RelationOpenSmgr(rel);
    1444-
    smgrcreate(rel->rd_smgr, INIT_FORKNUM, false);
    1445-
    log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
    1446-
    smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
    1447-
    }
    1448-
    14491427
    /*
    14501428
    * RelationRemoveInheritance
    14511429
    *
    @@ -3168,8 +3146,8 @@ heap_truncate_one_rel(Relation rel)
    31683146
    if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
    31693147
    return;
    31703148

    3171-
    /* Truncate the actual file (and discard buffers) */
    3172-
    RelationTruncate(rel, 0);
    3149+
    /* Truncate the underlying relation */
    3150+
    table_relation_nontransactional_truncate(rel);
    31733151

    31743152
    /* If the relation has indexes, truncate the indexes too */
    31753153
    RelationTruncateIndexes(rel);
    @@ -3180,7 +3158,7 @@ heap_truncate_one_rel(Relation rel)
    31803158
    {
    31813159
    Relation toastrel = table_open(toastrelid, AccessExclusiveLock);
    31823160

    3183-
    RelationTruncate(toastrel, 0);
    3161+
    table_relation_nontransactional_truncate(toastrel);
    31843162
    RelationTruncateIndexes(toastrel);
    31853163
    /* keep the lock... */
    31863164
    table_close(toastrel, NoLock);

    src/backend/catalog/index.c

    Lines changed: 8 additions & 3 deletions
    Original file line numberDiff line numberDiff line change
    @@ -739,6 +739,8 @@ index_create(Relation heapRelation,
    739739
    bool concurrent = (flags & INDEX_CREATE_CONCURRENT) != 0;
    740740
    bool partitioned = (flags & INDEX_CREATE_PARTITIONED) != 0;
    741741
    char relkind;
    742+
    TransactionId relfrozenxid;
    743+
    MultiXactId relminmxid;
    742744

    743745
    /* constraint flags can only be set when a constraint is requested */
    744746
    Assert((constr_flags == 0) ||
    @@ -899,8 +901,12 @@ index_create(Relation heapRelation,
    899901
    relpersistence,
    900902
    shared_relation,
    901903
    mapped_relation,
    902-
    allow_system_table_mods);
    904+
    allow_system_table_mods,
    905+
    &relfrozenxid,
    906+
    &relminmxid);
    903907

    908+
    Assert(relfrozenxid == InvalidTransactionId);
    909+
    Assert(relminmxid == InvalidMultiXactId);
    904910
    Assert(indexRelationId == RelationGetRelid(indexRelation));
    905911

    906912
    /*
    @@ -2850,8 +2856,7 @@ reindex_index(Oid indexId, bool skip_constraint_checks, char persistence,
    28502856
    }
    28512857

    28522858
    /* We'll build a new physical relation for the index */
    2853-
    RelationSetNewRelfilenode(iRel, persistence, InvalidTransactionId,
    2854-
    InvalidMultiXactId);
    2859+
    RelationSetNewRelfilenode(iRel, persistence);
    28552860

    28562861
    /* Initialize the index and rebuild */
    28572862
    /* Note: we do not need to re-establish pkey setting */

    src/backend/catalog/storage.c

    Lines changed: 88 additions & 0 deletions
    Original file line numberDiff line numberDiff line change
    @@ -19,6 +19,8 @@
    1919

    2020
    #include "postgres.h"
    2121

    22+
    #include "miscadmin.h"
    23+
    2224
    #include "access/visibilitymap.h"
    2325
    #include "access/xact.h"
    2426
    #include "access/xlog.h"
    @@ -290,6 +292,92 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
    290292
    smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
    291293
    }
    292294

    295+
    /*
    296+
    * Copy a fork's data, block by block.
    297+
    */
    298+
    void
    299+
    RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
    300+
    ForkNumber forkNum, char relpersistence)
    301+
    {
    302+
    PGAlignedBlock buf;
    303+
    Page page;
    304+
    bool use_wal;
    305+
    bool copying_initfork;
    306+
    BlockNumber nblocks;
    307+
    BlockNumber blkno;
    308+
    309+
    page = (Page) buf.data;
    310+
    311+
    /*
    312+
    * The init fork for an unlogged relation in many respects has to be
    313+
    * treated the same as normal relation, changes need to be WAL logged and
    314+
    * it needs to be synced to disk.
    315+
    */
    316+
    copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
    317+
    forkNum == INIT_FORKNUM;
    318+
    319+
    /*
    320+
    * We need to log the copied data in WAL iff WAL archiving/streaming is
    321+
    * enabled AND it's a permanent relation or an unlogged relation's init fork.
    322+
    */
    323+
    use_wal = XLogIsNeeded() &&
    324+
    (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
    325+
    326+
    nblocks = smgrnblocks(src, forkNum);
    327+
    328+
    for (blkno = 0; blkno < nblocks; blkno++)
    329+
    {
    330+
    /* If we got a cancel signal during the copy of the data, quit */
    331+
    CHECK_FOR_INTERRUPTS();
    332+
    333+
    smgrread(src, forkNum, blkno, buf.data);
    334+
    335+
    if (!PageIsVerified(page, blkno))
    336+
    ereport(ERROR,
    337+
    (errcode(ERRCODE_DATA_CORRUPTED),
    338+
    errmsg("invalid page in block %u of relation %s",
    339+
    blkno,
    340+
    relpathbackend(src->smgr_rnode.node,
    341+
    src->smgr_rnode.backend,
    342+
    forkNum))));
    343+
    344+
    /*
    345+
    * WAL-log the copied page. Unfortunately we don't know what kind of a
    346+
    * page this is, so we have to log the full page including any unused
    347+
    * space.
    348+
    */
    349+
    if (use_wal)
    350+
    log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false);
    351+
    352+
    PageSetChecksumInplace(page, blkno);
    353+
    354+
    /*
    355+
    * Now write the page. We say isTemp = true even if it's not a temp
    356+
    * rel, because there's no need for smgr to schedule an fsync for this
    357+
    * write; we'll do it ourselves below.
    358+
    */
    359+
    smgrextend(dst, forkNum, blkno, buf.data, true);
    360+
    }
    361+
    362+
    /*
    363+
    * If the rel is WAL-logged, must fsync before commit. We use smgrimmedsync
    364+
    * on the destination fork to do that. (For a temp or
    365+
    * unlogged rel we don't care since the data will be gone after a crash
    366+
    * anyway.)
    367+
    *
    368+
    * It's obvious that we must do this when not WAL-logging the copy. It's
    369+
    * less obvious that we have to do it even if we did WAL-log the copied
    370+
    * pages. The reason is that since we're copying outside shared buffers, a
    371+
    * CHECKPOINT occurring during the copy has no way to flush the previously
    372+
    * written data to disk (indeed it won't know the new rel even exists). A
    373+
    * crash later on would replay WAL from the checkpoint, therefore it
    374+
    * wouldn't replay our earlier WAL entries. If we do not fsync those pages
    375+
    * here, they might still not be on disk when the crash occurs.
    376+
    */
    377+
    if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
    378+
    smgrimmedsync(dst, forkNum);
    379+
    }
    380+
    293381
    /*
    294382
    * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
    295383
    *

    0 commit comments

    Comments
     (0)
    0