mirror of
https://github.com/postgres/postgres.git
synced 2025-09-03 15:22:11 +03:00
Revert "Skip WAL for new relfilenodes, under wal_level=minimal."
This reverts commit cb2fd7eac2. Per numerous buildfarm members, it was
incompatible with parallel query, and a test case assumed LP64. Back-patch
to 9.5 (all supported versions).
Discussion: https://postgr.es/m/20200321224920.GB1763544@rfd.leadboat.com
This commit is contained in:
@@ -763,6 +763,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
|
||||
bool *isnull;
|
||||
IndexScanDesc indexScan;
|
||||
HeapScanDesc heapScan;
|
||||
bool use_wal;
|
||||
bool is_system_catalog;
|
||||
TransactionId OldestXmin;
|
||||
TransactionId FreezeXid;
|
||||
@@ -819,9 +820,12 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
|
||||
LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
|
||||
|
||||
/*
|
||||
* Valid smgr_targblock implies something already wrote to the relation.
|
||||
* This may be harmless, but this function hasn't planned for it.
|
||||
* We need to log the copied data in WAL iff WAL archiving/streaming is
|
||||
* enabled AND it's a WAL-logged rel.
|
||||
*/
|
||||
use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
|
||||
|
||||
/* use_wal off requires smgr_targblock be initially invalid */
|
||||
Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
|
||||
|
||||
/*
|
||||
@@ -889,7 +893,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
|
||||
|
||||
/* Initialize the rewrite operation */
|
||||
rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
|
||||
MultiXactCutoff);
|
||||
MultiXactCutoff, use_wal);
|
||||
|
||||
/*
|
||||
* Decide whether to use an indexscan or seqscan-and-optional-sort to scan
|
||||
@@ -1281,25 +1285,6 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
|
||||
*mapped_tables++ = r2;
|
||||
}
|
||||
|
||||
/*
|
||||
* Recognize that rel1's relfilenode (swapped from rel2) is new in this
|
||||
* subtransaction. The rel2 storage (swapped from rel1) may or may not be
|
||||
* new.
|
||||
*/
|
||||
{
|
||||
Relation rel1,
|
||||
rel2;
|
||||
|
||||
rel1 = relation_open(r1, NoLock);
|
||||
rel2 = relation_open(r2, NoLock);
|
||||
rel2->rd_createSubid = rel1->rd_createSubid;
|
||||
rel2->rd_newRelfilenodeSubid = rel1->rd_newRelfilenodeSubid;
|
||||
rel2->rd_firstRelfilenodeSubid = rel1->rd_firstRelfilenodeSubid;
|
||||
RelationAssumeNewRelfilenode(rel1);
|
||||
relation_close(rel1, NoLock);
|
||||
relation_close(rel2, NoLock);
|
||||
}
|
||||
|
||||
/*
|
||||
* In the case of a shared catalog, these next few steps will only affect
|
||||
* our own database's pg_class row; but that's okay, because they are all
|
||||
|
@@ -2382,17 +2382,65 @@ CopyFrom(CopyState cstate)
|
||||
|
||||
tupDesc = RelationGetDescr(cstate->rel);
|
||||
|
||||
/*
|
||||
* If the target file is new-in-transaction, we assume that checking FSM
|
||||
* for free space is a waste of time. This could possibly be wrong, but
|
||||
* it's unlikely.
|
||||
/*----------
|
||||
* Check to see if we can avoid writing WAL
|
||||
*
|
||||
* If archive logging/streaming is not enabled *and* either
|
||||
* - table was created in same transaction as this COPY
|
||||
* - data is being written to relfilenode created in this transaction
|
||||
* then we can skip writing WAL. It's safe because if the transaction
|
||||
* doesn't commit, we'll discard the table (or the new relfilenode file).
|
||||
* If it does commit, we'll have done the heap_sync at the bottom of this
|
||||
* routine first.
|
||||
*
|
||||
* As mentioned in comments in utils/rel.h, the in-same-transaction test
|
||||
* is not always set correctly, since in rare cases rd_newRelfilenodeSubid
|
||||
* can be cleared before the end of the transaction. The exact case is
|
||||
* when a relation sets a new relfilenode twice in same transaction, yet
|
||||
* the second one fails in an aborted subtransaction, e.g.
|
||||
*
|
||||
* BEGIN;
|
||||
* TRUNCATE t;
|
||||
* SAVEPOINT save;
|
||||
* TRUNCATE t;
|
||||
* ROLLBACK TO save;
|
||||
* COPY ...
|
||||
*
|
||||
* Also, if the target file is new-in-transaction, we assume that checking
|
||||
* FSM for free space is a waste of time, even if we must use WAL because
|
||||
* of archiving. This could possibly be wrong, but it's unlikely.
|
||||
*
|
||||
* The comments for heap_insert and RelationGetBufferForTuple specify that
|
||||
* skipping WAL logging is only safe if we ensure that our tuples do not
|
||||
* go into pages containing tuples from any other transactions --- but this
|
||||
* must be the case if we have a new table or new relfilenode, so we need
|
||||
* no additional work to enforce that.
|
||||
*
|
||||
* We currently don't support this optimization if the COPY target is a
|
||||
* partitioned table as we currently only lazily initialize partition
|
||||
* information when routing the first tuple to the partition. We cannot
|
||||
* know at this stage if we can perform this optimization. It should be
|
||||
* possible to improve on this, but it does mean maintaining heap insert
|
||||
* option flags per partition and setting them when we first open the
|
||||
* partition.
|
||||
*
|
||||
* This optimization is not supported for relation types which do not
|
||||
* have any physical storage, with foreign tables and views using
|
||||
* INSTEAD OF triggers entering in this category. Partitioned tables
|
||||
* are not supported as per the description above.
|
||||
*----------
|
||||
*/
|
||||
/* createSubid is creation check, newRelfilenodeSubid is truncation check */
|
||||
if (cstate->rel->rd_rel->relkind != RELKIND_FOREIGN_TABLE &&
|
||||
cstate->rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE &&
|
||||
cstate->rel->rd_rel->relkind != RELKIND_VIEW &&
|
||||
(cstate->rel->rd_createSubid != InvalidSubTransactionId ||
|
||||
cstate->rel->rd_firstRelfilenodeSubid != InvalidSubTransactionId))
|
||||
cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId))
|
||||
{
|
||||
hi_options |= HEAP_INSERT_SKIP_FSM;
|
||||
if (!XLogIsNeeded())
|
||||
hi_options |= HEAP_INSERT_SKIP_WAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Optimize if new relfilenode was created in this subxact or one of its
|
||||
@@ -2884,6 +2932,13 @@ next_tuple:
|
||||
|
||||
FreeExecutorState(estate);
|
||||
|
||||
/*
|
||||
* If we skipped writing WAL, then we need to sync the heap (but not
|
||||
* indexes since those use WAL anyway)
|
||||
*/
|
||||
if (hi_options & HEAP_INSERT_SKIP_WAL)
|
||||
heap_sync(cstate->rel);
|
||||
|
||||
return processed;
|
||||
}
|
||||
|
||||
|
@@ -562,13 +562,16 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
|
||||
myState->rel = intoRelationDesc;
|
||||
myState->reladdr = intoRelationAddr;
|
||||
myState->output_cid = GetCurrentCommandId(true);
|
||||
myState->hi_options = HEAP_INSERT_SKIP_FSM;
|
||||
myState->bistate = GetBulkInsertState();
|
||||
|
||||
/*
|
||||
* Valid smgr_targblock implies something already wrote to the relation.
|
||||
* This may be harmless, but this function hasn't planned for it.
|
||||
* We can skip WAL-logging the insertions, unless PITR or streaming
|
||||
* replication is in use. We can skip the FSM in any case.
|
||||
*/
|
||||
myState->hi_options = HEAP_INSERT_SKIP_FSM |
|
||||
(XLogIsNeeded() ? 0 : HEAP_INSERT_SKIP_WAL);
|
||||
myState->bistate = GetBulkInsertState();
|
||||
|
||||
/* Not using WAL requires smgr_targblock be initially invalid */
|
||||
Assert(RelationGetTargetBlock(intoRelationDesc) == InvalidBlockNumber);
|
||||
}
|
||||
|
||||
@@ -614,6 +617,10 @@ intorel_shutdown(DestReceiver *self)
|
||||
|
||||
FreeBulkInsertState(myState->bistate);
|
||||
|
||||
/* If we skipped using WAL, must heap_sync before commit */
|
||||
if (myState->hi_options & HEAP_INSERT_SKIP_WAL)
|
||||
heap_sync(myState->rel);
|
||||
|
||||
/* close rel, but keep lock until commit */
|
||||
heap_close(myState->rel, NoLock);
|
||||
myState->rel = NULL;
|
||||
|
@@ -1050,8 +1050,6 @@ DefineIndex(Oid relationId,
|
||||
childStmt->relationId = childRelid;
|
||||
childStmt->indexOid = InvalidOid;
|
||||
childStmt->oldNode = InvalidOid;
|
||||
childStmt->oldCreateSubid = InvalidSubTransactionId;
|
||||
childStmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId;
|
||||
|
||||
/*
|
||||
* Adjust any Vars (both in expressions and in the index's
|
||||
|
@@ -457,13 +457,17 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
|
||||
*/
|
||||
myState->transientrel = transientrel;
|
||||
myState->output_cid = GetCurrentCommandId(true);
|
||||
myState->hi_options = HEAP_INSERT_SKIP_FSM | HEAP_INSERT_FROZEN;
|
||||
myState->bistate = GetBulkInsertState();
|
||||
|
||||
/*
|
||||
* Valid smgr_targblock implies something already wrote to the relation.
|
||||
* This may be harmless, but this function hasn't planned for it.
|
||||
* We can skip WAL-logging the insertions, unless PITR or streaming
|
||||
* replication is in use. We can skip the FSM in any case.
|
||||
*/
|
||||
myState->hi_options = HEAP_INSERT_SKIP_FSM | HEAP_INSERT_FROZEN;
|
||||
if (!XLogIsNeeded())
|
||||
myState->hi_options |= HEAP_INSERT_SKIP_WAL;
|
||||
myState->bistate = GetBulkInsertState();
|
||||
|
||||
/* Not using WAL requires smgr_targblock be initially invalid */
|
||||
Assert(RelationGetTargetBlock(transientrel) == InvalidBlockNumber);
|
||||
}
|
||||
|
||||
@@ -503,6 +507,10 @@ transientrel_shutdown(DestReceiver *self)
|
||||
|
||||
FreeBulkInsertState(myState->bistate);
|
||||
|
||||
/* If we skipped using WAL, must heap_sync before commit */
|
||||
if (myState->hi_options & HEAP_INSERT_SKIP_WAL)
|
||||
heap_sync(myState->transientrel);
|
||||
|
||||
/* close transientrel, but keep lock until commit */
|
||||
heap_close(myState->transientrel, NoLock);
|
||||
myState->transientrel = NULL;
|
||||
|
@@ -4650,14 +4650,19 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
|
||||
newrel = NULL;
|
||||
|
||||
/*
|
||||
* Prepare a BulkInsertState and options for heap_insert. The FSM is
|
||||
* empty, so don't bother using it.
|
||||
* Prepare a BulkInsertState and options for heap_insert. Because we're
|
||||
* building a new heap, we can skip WAL-logging and fsync it to disk at
|
||||
* the end instead (unless WAL-logging is required for archiving or
|
||||
* streaming replication). The FSM is empty too, so don't bother using it.
|
||||
*/
|
||||
if (newrel)
|
||||
{
|
||||
mycid = GetCurrentCommandId(true);
|
||||
bistate = GetBulkInsertState();
|
||||
|
||||
hi_options = HEAP_INSERT_SKIP_FSM;
|
||||
if (!XLogIsNeeded())
|
||||
hi_options |= HEAP_INSERT_SKIP_WAL;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -4929,6 +4934,10 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
|
||||
{
|
||||
FreeBulkInsertState(bistate);
|
||||
|
||||
/* If we skipped writing WAL, then we need to sync the heap. */
|
||||
if (hi_options & HEAP_INSERT_SKIP_WAL)
|
||||
heap_sync(newrel);
|
||||
|
||||
heap_close(newrel, NoLock);
|
||||
}
|
||||
}
|
||||
@@ -7072,19 +7081,14 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
|
||||
|
||||
/*
|
||||
* If TryReuseIndex() stashed a relfilenode for us, we used it for the new
|
||||
* index instead of building from scratch. Restore associated fields.
|
||||
* This may store InvalidSubTransactionId in both fields, in which case
|
||||
* relcache.c will assume it can rebuild the relcache entry. Hence, do
|
||||
* this after the CCI that made catalog rows visible to any rebuild. The
|
||||
* DROP of the old edition of this index will have scheduled the storage
|
||||
* for deletion at commit, so cancel that pending deletion.
|
||||
* index instead of building from scratch. The DROP of the old edition of
|
||||
* this index will have scheduled the storage for deletion at commit, so
|
||||
* cancel that pending deletion.
|
||||
*/
|
||||
if (OidIsValid(stmt->oldNode))
|
||||
{
|
||||
Relation irel = index_open(address.objectId, NoLock);
|
||||
|
||||
irel->rd_createSubid = stmt->oldCreateSubid;
|
||||
irel->rd_firstRelfilenodeSubid = stmt->oldFirstRelfilenodeSubid;
|
||||
RelationPreserveStorage(irel->rd_node, true);
|
||||
index_close(irel, NoLock);
|
||||
}
|
||||
@@ -10876,11 +10880,7 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt)
|
||||
|
||||
/* If it's a partitioned index, there is no storage to share. */
|
||||
if (irel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
|
||||
{
|
||||
stmt->oldNode = irel->rd_node.relNode;
|
||||
stmt->oldCreateSubid = irel->rd_createSubid;
|
||||
stmt->oldFirstRelfilenodeSubid = irel->rd_firstRelfilenodeSubid;
|
||||
}
|
||||
index_close(irel, NoLock);
|
||||
}
|
||||
}
|
||||
@@ -11735,8 +11735,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
|
||||
|
||||
heap_close(pg_class, RowExclusiveLock);
|
||||
|
||||
RelationAssumeNewRelfilenode(rel);
|
||||
|
||||
relation_close(rel, NoLock);
|
||||
|
||||
/* Make sure the reltablespace change is visible */
|
||||
@@ -12008,9 +12006,7 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
|
||||
|
||||
/*
|
||||
* We need to log the copied data in WAL iff WAL archiving/streaming is
|
||||
* enabled AND it's a permanent relation. This gives the same answer as
|
||||
* "RelationNeedsWAL(rel) || copying_initfork", because we know the
|
||||
* current operation created a new relfilenode.
|
||||
* enabled AND it's a permanent relation.
|
||||
*/
|
||||
use_wal = XLogIsNeeded() &&
|
||||
(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
|
||||
@@ -12052,15 +12048,21 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
|
||||
}
|
||||
|
||||
/*
|
||||
* When we WAL-logged rel pages, we must nonetheless fsync them. The
|
||||
* reason is that since we're copying outside shared buffers, a CHECKPOINT
|
||||
* occurring during the copy has no way to flush the previously written
|
||||
* data to disk (indeed it won't know the new rel even exists). A crash
|
||||
* later on would replay WAL from the checkpoint, therefore it wouldn't
|
||||
* replay our earlier WAL entries. If we do not fsync those pages here,
|
||||
* they might still not be on disk when the crash occurs.
|
||||
* If the rel is WAL-logged, must fsync before commit. We use heap_sync
|
||||
* to ensure that the toast table gets fsync'd too. (For a temp or
|
||||
* unlogged rel we don't care since the data will be gone after a crash
|
||||
* anyway.)
|
||||
*
|
||||
* It's obvious that we must do this when not WAL-logging the copy. It's
|
||||
* less obvious that we have to do it even if we did WAL-log the copied
|
||||
* pages. The reason is that since we're copying outside shared buffers, a
|
||||
* CHECKPOINT occurring during the copy has no way to flush the previously
|
||||
* written data to disk (indeed it won't know the new rel even exists). A
|
||||
* crash later on would replay WAL from the checkpoint, therefore it
|
||||
* wouldn't replay our earlier WAL entries. If we do not fsync those pages
|
||||
* here, they might still not be on disk when the crash occurs.
|
||||
*/
|
||||
if (use_wal || copying_initfork)
|
||||
if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
|
||||
smgrimmedsync(dst, forkNum);
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user