1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-24 00:23:06 +03:00

Revert "Skip WAL for new relfilenodes, under wal_level=minimal."

This reverts commit cb2fd7eac2.  Per
numerous buildfarm members, it was incompatible with parallel query, and
a test case assumed LP64.  Back-patch to 9.5 (all supported versions).

Discussion: https://postgr.es/m/20200321224920.GB1763544@rfd.leadboat.com
This commit is contained in:
Noah Misch
2020-03-22 09:24:09 -07:00
parent d0587f52b3
commit de9396326e
51 changed files with 362 additions and 1438 deletions

View File

@@ -29,13 +29,9 @@
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/rel.h"
/*
 * GUC variables
 *
 * wal_skip_threshold: size limit, in kilobytes, that smgrDoPendingSyncs()
 * compares against the combined size of a pending-sync relation's forks when
 * choosing between emitting WAL for the blocks and syncing the files.
 */
int wal_skip_threshold = 2048; /* in kilobytes */
/*
* We keep a list of all relations (represented as RelFileNode values)
* that have been created or deleted in the current transaction. When
@@ -65,14 +61,7 @@ typedef struct PendingRelDelete
struct PendingRelDelete *next; /* linked-list link */
} PendingRelDelete;
/*
 * Entry of pendingSyncHash: one relfilenode queued for at-commit sync
 * handling.
 *
 * rnode is the lookup key: the table is created with keysize
 * sizeof(RelFileNode) and searched with a RelFileNode pointer.
 * NOTE(review): dynahash presumably requires the key to be the first member
 * of the entry -- confirm against utils/hsearch.h before reordering fields.
 */
typedef struct pendingSync
{
RelFileNode rnode;
bool is_truncated; /* Has the file experienced truncation? */
} pendingSync;
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
/*
 * Relfilenodes queued for sync handling at commit, keyed by RelFileNode with
 * pendingSync entries.  Created lazily by RelationCreateStorage(); NULL when
 * nothing is queued, and reset to NULL by smgrDoPendingSyncs().
 */
HTAB *pendingSyncHash = NULL;
/*
* RelationCreateStorage
@@ -128,32 +117,6 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
pending->next = pendingDeletes;
pendingDeletes = pending;
/* Queue an at-commit sync. */
if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
{
pendingSync *pending;
bool found;
/* we sync only permanent relations */
Assert(backend == InvalidBackendId);
if (!pendingSyncHash)
{
HASHCTL ctl;
ctl.keysize = sizeof(RelFileNode);
ctl.entrysize = sizeof(pendingSync);
ctl.hcxt = TopTransactionContext;
pendingSyncHash =
hash_create("pending sync hash",
16, &ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
}
pending = hash_search(pendingSyncHash, &rnode, HASH_ENTER, &found);
Assert(!found);
pending->is_truncated = false;
}
return srel;
}
@@ -312,8 +275,6 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
}
}
RelationPreTruncate(rel);
/*
* We WAL-log the truncation before actually truncating, which means
* trouble if the truncation fails. If we then crash, the WAL replay
@@ -364,28 +325,6 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
}
/*
 * RelationPreTruncate
 *		Do the access-method-independent bookkeeping that must precede a
 *		physical truncation.
 *
 * If the relation's relfilenode is queued in pendingSyncHash, mark its entry
 * as truncated.  When no pending-sync table exists, or the relation is not
 * in it, this is a no-op.
 *
 * An access method whose relation_nontransactional_truncate does not go
 * through RelationTruncate() must call this itself before shrinking the
 * table.
 */
void
RelationPreTruncate(Relation rel)
{
	pendingSync *entry;

	/* No pending-sync table means nothing is being tracked. */
	if (pendingSyncHash == NULL)
		return;

	/* rd_smgr must be valid before its relfilenode can be read. */
	RelationOpenSmgr(rel);

	entry = hash_search(pendingSyncHash,
						&(rel->rd_smgr->smgr_rnode.node),
						HASH_FIND,
						NULL);
	if (entry == NULL)
		return;

	entry->is_truncated = true;
}
/*
* Copy a fork's data, block by block.
*
@@ -416,9 +355,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
/*
* We need to log the copied data in WAL iff WAL archiving/streaming is
* enabled AND it's a permanent relation. This gives the same answer as
* "RelationNeedsWAL(rel) || copying_initfork", because we know the
* current operation created a new relfilenode.
* enabled AND it's a permanent relation.
*/
use_wal = XLogIsNeeded() &&
(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
@@ -460,39 +397,24 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
}
/*
* When we WAL-logged rel pages, we must nonetheless fsync them. The
* reason is that since we're copying outside shared buffers, a CHECKPOINT
* occurring during the copy has no way to flush the previously written
* data to disk (indeed it won't know the new rel even exists). A crash
* later on would replay WAL from the checkpoint, therefore it wouldn't
* replay our earlier WAL entries. If we do not fsync those pages here,
* they might still not be on disk when the crash occurs.
* If the rel is WAL-logged, must fsync before commit. We use heap_sync
* to ensure that the toast table gets fsync'd too. (For a temp or
* unlogged rel we don't care since the data will be gone after a crash
* anyway.)
*
* It's obvious that we must do this when not WAL-logging the copy. It's
* less obvious that we have to do it even if we did WAL-log the copied
* pages. The reason is that since we're copying outside shared buffers, a
* CHECKPOINT occurring during the copy has no way to flush the previously
* written data to disk (indeed it won't know the new rel even exists). A
* crash later on would replay WAL from the checkpoint, therefore it
* wouldn't replay our earlier WAL entries. If we do not fsync those pages
* here, they might still not be on disk when the crash occurs.
*/
if (use_wal || copying_initfork)
if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
smgrimmedsync(dst, forkNum);
}
/*
 * RelFileNodeSkippingWAL - check if a BM_PERMANENT relfilenode is using WAL
 *
 * Returns true only when WAL is not needed (XLogIsNeeded() is false) and
 * rnode has an entry in pendingSyncHash; returns false otherwise.
 *
 * Changes of certain relfilenodes must not write WAL; see "Skipping WAL for
 * New RelFileNode" in src/backend/access/transam/README.  Callers holding a
 * Relation can learn this more cheaply from it; this function serves the
 * code paths that have only the RelFileNode.
 */
bool
RelFileNodeSkippingWAL(RelFileNode rnode)
{
	/*
	 * When WAL is needed no permanent relfilenode skips it, and without a
	 * pending-sync table there is nothing to look up.
	 */
	if (!XLogIsNeeded() && pendingSyncHash != NULL)
		return hash_search(pendingSyncHash, &rnode, HASH_FIND, NULL) != NULL;

	return false;
}
/*
* smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
*
@@ -570,144 +492,6 @@ smgrDoPendingDeletes(bool isCommit)
}
}
/*
* smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
*/
void
smgrDoPendingSyncs(bool isCommit)
{
PendingRelDelete *pending;
int nrels = 0,
maxrels = 0;
SMgrRelation *srels = NULL;
HASH_SEQ_STATUS scan;
pendingSync *pendingsync;
if (XLogIsNeeded())
return; /* no relation can use this */
Assert(GetCurrentTransactionNestLevel() == 1);
if (!pendingSyncHash)
return; /* no relation needs sync */
/* Just throw away all pending syncs if any at rollback */
if (!isCommit)
{
pendingSyncHash = NULL;
return;
}
AssertPendingSyncs_RelationCache();
/* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
for (pending = pendingDeletes; pending != NULL; pending = pending->next)
{
if (!pending->atCommit)
continue;
(void) hash_search(pendingSyncHash, (void *) &pending->relnode,
HASH_REMOVE, NULL);
}
hash_seq_init(&scan, pendingSyncHash);
while ((pendingsync = (pendingSync *) hash_seq_search(&scan)))
{
ForkNumber fork;
BlockNumber nblocks[MAX_FORKNUM + 1];
BlockNumber total_blocks = 0;
SMgrRelation srel;
srel = smgropen(pendingsync->rnode, InvalidBackendId);
/*
* We emit newpage WAL records for smaller relations.
*
* Small WAL records have a chance to be emitted along with other
* backends' WAL records. We emit WAL records instead of syncing for
* files that are smaller than a certain threshold, expecting faster
* commit. The threshold is defined by the GUC wal_skip_threshold.
*/
if (!pendingsync->is_truncated)
{
for (fork = 0; fork <= MAX_FORKNUM; fork++)
{
if (smgrexists(srel, fork))
{
BlockNumber n = smgrnblocks(srel, fork);
/* we shouldn't come here for unlogged relations */
Assert(fork != INIT_FORKNUM);
nblocks[fork] = n;
total_blocks += n;
}
else
nblocks[fork] = InvalidBlockNumber;
}
}
/*
* Sync file or emit WAL records for its contents.
*
* Although we emit WAL record if the file is small enough, do file
* sync regardless of the size if the file has experienced a
* truncation. It is because the file would be followed by trailing
* garbage blocks after a crash recovery if, while a past longer file
* had been flushed out, we omitted syncing-out of the file and
* emitted WAL instead. You might think that we could choose WAL if
* the current main fork is longer than ever, but there's a case where
* main fork is longer than ever but FSM fork gets shorter.
*/
if (pendingsync->is_truncated ||
total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
{
/* allocate the initial array, or extend it, if needed */
if (maxrels == 0)
{
maxrels = 8;
srels = palloc(sizeof(SMgrRelation) * maxrels);
}
else if (maxrels <= nrels)
{
maxrels *= 2;
srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
}
srels[nrels++] = srel;
}
else
{
/* Emit WAL records for all blocks. The file is small enough. */
for (fork = 0; fork <= MAX_FORKNUM; fork++)
{
int n = nblocks[fork];
Relation rel;
if (!BlockNumberIsValid(n))
continue;
/*
* Emit WAL for the whole file. Unfortunately we don't know
* what kind of a page this is, so we have to log the full
* page including any unused space. ReadBufferExtended()
* counts some pgstat events; unfortunately, we discard them.
*/
rel = CreateFakeRelcacheEntry(srel->smgr_rnode.node);
log_newpage_range(rel, fork, 0, n, false);
FreeFakeRelcacheEntry(rel);
}
}
}
pendingSyncHash = NULL;
if (nrels > 0)
{
smgrdosyncall(srels, nrels);
pfree(srels);
}
}
/*
* smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
*