mirror of
https://github.com/postgres/postgres.git
synced 2025-08-19 23:22:23 +03:00
Some tests try to invalidate logical slots on the standby server by running VACUUM on the primary. The problem is that xl_running_xacts was getting generated and replayed before the VACUUM command, leading to the advancement of the active slot's catalog_xmin. Due to this, active slots were not getting invalidated, leading to test failures. We fix it by skipping the generation of xl_running_xacts for the required tests with the help of injection points. As the required interface for injection points was not present in back branches, we fixed the failing tests in them by disallowing the slot to become active for the required cases (where rows_removed conflict could be generated). Author: Hayato Kuroda <kuroda.hayato@fujitsu.com> Reviewed-by: Bertrand Drouvot <bertranddrouvot.pg@gmail.com> Reviewed-by: Amit Kapila <amit.kapila16@gmail.com> Backpatch-through: 16, where it was introduced Discussion: https://postgr.es/m/Z6oQXc8LmiTLfwLA@ip-10-97-1-34.eu-west-3.compute.internal
1525 lines
48 KiB
C
1525 lines
48 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* standby.c
|
|
* Misc functions used in Hot Standby mode.
|
|
*
|
|
* All functions for handling RM_STANDBY_ID, which relate to
|
|
* AccessExclusiveLocks and starting snapshots for Hot Standby mode.
|
|
* Plus conflict recovery processing.
|
|
*
|
|
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
* IDENTIFICATION
|
|
* src/backend/storage/ipc/standby.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
#include "access/transam.h"
|
|
#include "access/twophase.h"
|
|
#include "access/xact.h"
|
|
#include "access/xloginsert.h"
|
|
#include "access/xlogrecovery.h"
|
|
#include "access/xlogutils.h"
|
|
#include "miscadmin.h"
|
|
#include "pgstat.h"
|
|
#include "replication/slot.h"
|
|
#include "storage/bufmgr.h"
|
|
#include "storage/proc.h"
|
|
#include "storage/procarray.h"
|
|
#include "storage/sinvaladt.h"
|
|
#include "storage/standby.h"
|
|
#include "utils/hsearch.h"
|
|
#include "utils/injection_point.h"
|
|
#include "utils/ps_status.h"
|
|
#include "utils/timeout.h"
|
|
#include "utils/timestamp.h"
|
|
|
|
/* User-settable GUC parameters */
|
|
/*
 * Delay, in milliseconds, added to the last WAL receipt time to obtain the
 * cutoff for canceling conflicting standby transactions when the WAL being
 * replayed did not arrive from a stream.  A negative value means wait forever.
 */
int			max_standby_archive_delay = 30 * 1000;

/* Same as above, but applied when the WAL was received from a stream. */
int			max_standby_streaming_delay = 30 * 1000;

/* When true, sufficiently long recovery-conflict waits are written to the log. */
bool		log_recovery_conflict_waits = false;
|
|
|
|
/*
|
|
* Keep track of all the exclusive locks owned by original transactions.
|
|
* For each known exclusive lock, there is a RecoveryLockEntry in the
|
|
* RecoveryLockHash hash table. All RecoveryLockEntrys belonging to a
|
|
* given XID are chained together so that we can find them easily.
|
|
* For each original transaction that is known to have any such locks,
|
|
* there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
|
|
* which stores the head of the chain of its locks.
|
|
*/
|
|
typedef struct RecoveryLockEntry
|
|
{
|
|
xl_standby_lock key; /* hash key: xid, dbOid, relOid */
|
|
struct RecoveryLockEntry *next; /* chain link */
|
|
} RecoveryLockEntry;
|
|
|
|
typedef struct RecoveryLockXidEntry
|
|
{
|
|
TransactionId xid; /* hash key -- must be first */
|
|
struct RecoveryLockEntry *head; /* chain head */
|
|
} RecoveryLockXidEntry;
|
|
|
|
/*
 * Table of RecoveryLockEntry, keyed by xl_standby_lock.  NULL whenever
 * recovery transaction tracking is not initialized.
 */
static HTAB *RecoveryLockHash = NULL;

/* Table of RecoveryLockXidEntry, keyed by TransactionId. */
static HTAB *RecoveryLockXidHash = NULL;
|
|
|
|
/* Flags set by timeout handlers */
|
|
/*
 * Examined after the waits in ResolveRecoveryConflictWithLock() and
 * ResolveRecoveryConflictWithBufferPin() to decide whether a deadlock-check
 * request must be sent.
 */
static volatile sig_atomic_t got_standby_deadlock_timeout = false;

/* Examined after the wait in ResolveRecoveryConflictWithBufferPin(). */
static volatile sig_atomic_t got_standby_delay_timeout = false;

/* Examined after the wait in ResolveRecoveryConflictWithLock(). */
static volatile sig_atomic_t got_standby_lock_timeout = false;
|
|
|
|
static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
|
|
ProcSignalReason reason,
|
|
uint32 wait_event_info,
|
|
bool report_waiting);
|
|
static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
|
|
static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
|
|
static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
|
|
static const char *get_recovery_conflict_desc(ProcSignalReason reason);
|
|
|
|
/*
|
|
* InitRecoveryTransactionEnvironment
|
|
* Initialize tracking of our primary's in-progress transactions.
|
|
*
|
|
* We need to issue shared invalidations and hold locks. Holding locks
|
|
* means others may want to wait on us, so we need to make a lock table
|
|
* vxact entry like a real transaction. We could create and delete
|
|
* lock table entries for each transaction but it's simpler just to create
|
|
* one permanent entry and leave it there all the time. Locks are then
|
|
* acquired and released as needed. Yes, this means you can see the
|
|
* Startup process in pg_locks once we have run this.
|
|
*/
|
|
void
|
|
InitRecoveryTransactionEnvironment(void)
|
|
{
|
|
VirtualTransactionId vxid;
|
|
HASHCTL hash_ctl;
|
|
|
|
Assert(RecoveryLockHash == NULL); /* don't run this twice */
|
|
|
|
/*
|
|
* Initialize the hash tables for tracking the locks held by each
|
|
* transaction.
|
|
*/
|
|
hash_ctl.keysize = sizeof(xl_standby_lock);
|
|
hash_ctl.entrysize = sizeof(RecoveryLockEntry);
|
|
RecoveryLockHash = hash_create("RecoveryLockHash",
|
|
64,
|
|
&hash_ctl,
|
|
HASH_ELEM | HASH_BLOBS);
|
|
hash_ctl.keysize = sizeof(TransactionId);
|
|
hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
|
|
RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
|
|
64,
|
|
&hash_ctl,
|
|
HASH_ELEM | HASH_BLOBS);
|
|
|
|
/*
|
|
* Initialize shared invalidation management for Startup process, being
|
|
* careful to register ourselves as a sendOnly process so we don't need to
|
|
* read messages, nor will we get signaled when the queue starts filling
|
|
* up.
|
|
*/
|
|
SharedInvalBackendInit(true);
|
|
|
|
/*
|
|
* Lock a virtual transaction id for Startup process.
|
|
*
|
|
* We need to do GetNextLocalTransactionId() because
|
|
* SharedInvalBackendInit() leaves localTransactionId invalid and the lock
|
|
* manager doesn't like that at all.
|
|
*
|
|
* Note that we don't need to run XactLockTableInsert() because nobody
|
|
* needs to wait on xids. That sounds a little strange, but table locks
|
|
* are held by vxids and row level locks are held by xids. All queries
|
|
* hold AccessShareLocks so never block while we write or lock new rows.
|
|
*/
|
|
MyProc->vxid.procNumber = MyProcNumber;
|
|
vxid.procNumber = MyProcNumber;
|
|
vxid.localTransactionId = GetNextLocalTransactionId();
|
|
VirtualXactLockTableInsert(vxid);
|
|
|
|
standbyState = STANDBY_INITIALIZED;
|
|
}
|
|
|
|
/*
|
|
* ShutdownRecoveryTransactionEnvironment
|
|
* Shut down transaction tracking
|
|
*
|
|
* Prepare to switch from hot standby mode to normal operation. Shut down
|
|
* recovery-time transaction tracking.
|
|
*
|
|
* This must be called even in shutdown of startup process if transaction
|
|
* tracking has been initialized. Otherwise some locks the tracked
|
|
* transactions were holding will not be released and may interfere with
|
|
* the processes still running (but will exit soon later) at the exit of
|
|
* startup process.
|
|
*/
|
|
void
|
|
ShutdownRecoveryTransactionEnvironment(void)
|
|
{
|
|
/*
|
|
* Do nothing if RecoveryLockHash is NULL because that means that
|
|
* transaction tracking has not yet been initialized or has already been
|
|
* shut down. This makes it safe to have possibly-redundant calls of this
|
|
* function during process exit.
|
|
*/
|
|
if (RecoveryLockHash == NULL)
|
|
return;
|
|
|
|
/* Mark all tracked in-progress transactions as finished. */
|
|
ExpireAllKnownAssignedTransactionIds();
|
|
|
|
/* Release all locks the tracked transactions were holding */
|
|
StandbyReleaseAllLocks();
|
|
|
|
/* Destroy the lock hash tables. */
|
|
hash_destroy(RecoveryLockHash);
|
|
hash_destroy(RecoveryLockXidHash);
|
|
RecoveryLockHash = NULL;
|
|
RecoveryLockXidHash = NULL;
|
|
|
|
/* Cleanup our VirtualTransaction */
|
|
VirtualXactLockTableCleanup();
|
|
}
|
|
|
|
|
|
/*
|
|
* -----------------------------------------------------
|
|
* Standby wait timers and backend cancel logic
|
|
* -----------------------------------------------------
|
|
*/
|
|
|
|
/*
|
|
* Determine the cutoff time at which we want to start canceling conflicting
|
|
* transactions. Returns zero (a time safely in the past) if we are willing
|
|
* to wait forever.
|
|
*/
|
|
static TimestampTz
|
|
GetStandbyLimitTime(void)
|
|
{
|
|
TimestampTz rtime;
|
|
bool fromStream;
|
|
|
|
/*
|
|
* The cutoff time is the last WAL data receipt time plus the appropriate
|
|
* delay variable. Delay of -1 means wait forever.
|
|
*/
|
|
GetXLogReceiptTime(&rtime, &fromStream);
|
|
if (fromStream)
|
|
{
|
|
if (max_standby_streaming_delay < 0)
|
|
return 0; /* wait forever */
|
|
return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
|
|
}
|
|
else
|
|
{
|
|
if (max_standby_archive_delay < 0)
|
|
return 0; /* wait forever */
|
|
return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
|
|
}
|
|
}
|
|
|
|
/* Length, in microseconds, of the first sleep when waiting on a conflict */
#define STANDBY_INITIAL_WAIT_US  1000

/* Current sleep length; grown by WaitExceedsMaxStandbyDelay() after each nap */
static int	standbyWait_us = STANDBY_INITIAL_WAIT_US;
|
|
|
|
/*
|
|
* Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
|
|
* We wait here for a while then return. If we decide we can't wait any
|
|
* more then we return true, if we can wait some more return false.
|
|
*/
|
|
static bool
|
|
WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
|
|
{
|
|
TimestampTz ltime;
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
/* Are we past the limit time? */
|
|
ltime = GetStandbyLimitTime();
|
|
if (ltime && GetCurrentTimestamp() >= ltime)
|
|
return true;
|
|
|
|
/*
|
|
* Sleep a bit (this is essential to avoid busy-waiting).
|
|
*/
|
|
pgstat_report_wait_start(wait_event_info);
|
|
pg_usleep(standbyWait_us);
|
|
pgstat_report_wait_end();
|
|
|
|
/*
|
|
* Progressively increase the sleep times, but not to more than 1s, since
|
|
* pg_usleep isn't interruptible on some platforms.
|
|
*/
|
|
standbyWait_us *= 2;
|
|
if (standbyWait_us > 1000000)
|
|
standbyWait_us = 1000000;
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Log the recovery conflict.
|
|
*
|
|
* wait_start is the timestamp when the caller started to wait.
|
|
* now is the timestamp when this function has been called.
|
|
* wait_list is the list of virtual transaction ids assigned to
|
|
* conflicting processes. still_waiting indicates whether
|
|
* the startup process is still waiting for the recovery conflict
|
|
* to be resolved or not.
|
|
*/
|
|
void
|
|
LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
|
|
TimestampTz now, VirtualTransactionId *wait_list,
|
|
bool still_waiting)
|
|
{
|
|
long secs;
|
|
int usecs;
|
|
long msecs;
|
|
StringInfoData buf;
|
|
int nprocs = 0;
|
|
|
|
/*
|
|
* There must be no conflicting processes when the recovery conflict has
|
|
* already been resolved.
|
|
*/
|
|
Assert(still_waiting || wait_list == NULL);
|
|
|
|
TimestampDifference(wait_start, now, &secs, &usecs);
|
|
msecs = secs * 1000 + usecs / 1000;
|
|
usecs = usecs % 1000;
|
|
|
|
if (wait_list)
|
|
{
|
|
VirtualTransactionId *vxids;
|
|
|
|
/* Construct a string of list of the conflicting processes */
|
|
vxids = wait_list;
|
|
while (VirtualTransactionIdIsValid(*vxids))
|
|
{
|
|
PGPROC *proc = ProcNumberGetProc(vxids->procNumber);
|
|
|
|
/* proc can be NULL if the target backend is not active */
|
|
if (proc)
|
|
{
|
|
if (nprocs == 0)
|
|
{
|
|
initStringInfo(&buf);
|
|
appendStringInfo(&buf, "%d", proc->pid);
|
|
}
|
|
else
|
|
appendStringInfo(&buf, ", %d", proc->pid);
|
|
|
|
nprocs++;
|
|
}
|
|
|
|
vxids++;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If wait_list is specified, report the list of PIDs of active
|
|
* conflicting backends in a detail message. Note that if all the backends
|
|
* in the list are not active, no detail message is logged.
|
|
*/
|
|
if (still_waiting)
|
|
{
|
|
ereport(LOG,
|
|
errmsg("recovery still waiting after %ld.%03d ms: %s",
|
|
msecs, usecs, get_recovery_conflict_desc(reason)),
|
|
nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
|
|
"Conflicting processes: %s.",
|
|
nprocs, buf.data) : 0);
|
|
}
|
|
else
|
|
{
|
|
ereport(LOG,
|
|
errmsg("recovery finished waiting after %ld.%03d ms: %s",
|
|
msecs, usecs, get_recovery_conflict_desc(reason)));
|
|
}
|
|
|
|
if (nprocs > 0)
|
|
pfree(buf.data);
|
|
}
|
|
|
|
/*
|
|
* This is the main executioner for any query backend that conflicts with
|
|
* recovery processing. Judgement has already been passed on it within
|
|
* a specific rmgr. Here we just issue the orders to the procs. The procs
|
|
* then throw the required error as instructed.
|
|
*
|
|
* If report_waiting is true, "waiting" is reported in PS display and the
|
|
* wait for recovery conflict is reported in the log, if necessary. If
|
|
* the caller is responsible for reporting them, report_waiting should be
|
|
* false. Otherwise, both the caller and this function report the same
|
|
* thing unexpectedly.
|
|
*/
|
|
static void
|
|
ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
|
|
ProcSignalReason reason, uint32 wait_event_info,
|
|
bool report_waiting)
|
|
{
|
|
TimestampTz waitStart = 0;
|
|
bool waiting = false;
|
|
bool logged_recovery_conflict = false;
|
|
|
|
/* Fast exit, to avoid a kernel call if there's no work to be done. */
|
|
if (!VirtualTransactionIdIsValid(*waitlist))
|
|
return;
|
|
|
|
/* Set the wait start timestamp for reporting */
|
|
if (report_waiting && (log_recovery_conflict_waits || update_process_title))
|
|
waitStart = GetCurrentTimestamp();
|
|
|
|
while (VirtualTransactionIdIsValid(*waitlist))
|
|
{
|
|
/* reset standbyWait_us for each xact we wait for */
|
|
standbyWait_us = STANDBY_INITIAL_WAIT_US;
|
|
|
|
/* wait until the virtual xid is gone */
|
|
while (!VirtualXactLock(*waitlist, false))
|
|
{
|
|
/* Is it time to kill it? */
|
|
if (WaitExceedsMaxStandbyDelay(wait_event_info))
|
|
{
|
|
pid_t pid;
|
|
|
|
/*
|
|
* Now find out who to throw out of the balloon.
|
|
*/
|
|
Assert(VirtualTransactionIdIsValid(*waitlist));
|
|
pid = CancelVirtualTransaction(*waitlist, reason);
|
|
|
|
/*
|
|
* Wait a little bit for it to die so that we avoid flooding
|
|
* an unresponsive backend when system is heavily loaded.
|
|
*/
|
|
if (pid != 0)
|
|
pg_usleep(5000L);
|
|
}
|
|
|
|
if (waitStart != 0 && (!logged_recovery_conflict || !waiting))
|
|
{
|
|
TimestampTz now = 0;
|
|
bool maybe_log_conflict;
|
|
bool maybe_update_title;
|
|
|
|
maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
|
|
maybe_update_title = (update_process_title && !waiting);
|
|
|
|
/* Get the current timestamp if not report yet */
|
|
if (maybe_log_conflict || maybe_update_title)
|
|
now = GetCurrentTimestamp();
|
|
|
|
/*
|
|
* Report via ps if we have been waiting for more than 500
|
|
* msec (should that be configurable?)
|
|
*/
|
|
if (maybe_update_title &&
|
|
TimestampDifferenceExceeds(waitStart, now, 500))
|
|
{
|
|
set_ps_display_suffix("waiting");
|
|
waiting = true;
|
|
}
|
|
|
|
/*
|
|
* Emit the log message if the startup process is waiting
|
|
* longer than deadlock_timeout for recovery conflict.
|
|
*/
|
|
if (maybe_log_conflict &&
|
|
TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
|
|
{
|
|
LogRecoveryConflict(reason, waitStart, now, waitlist, true);
|
|
logged_recovery_conflict = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* The virtual transaction is gone now, wait for the next one */
|
|
waitlist++;
|
|
}
|
|
|
|
/*
|
|
* Emit the log message if recovery conflict was resolved but the startup
|
|
* process waited longer than deadlock_timeout for it.
|
|
*/
|
|
if (logged_recovery_conflict)
|
|
LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
|
|
NULL, false);
|
|
|
|
/* reset ps display to remove the suffix if we added one */
|
|
if (waiting)
|
|
set_ps_display_remove_suffix();
|
|
|
|
}
|
|
|
|
/*
|
|
* Generate whatever recovery conflicts are needed to eliminate snapshots that
|
|
* might see XIDs <= snapshotConflictHorizon as still running.
|
|
*
|
|
* snapshotConflictHorizon cutoffs are our standard approach to generating
|
|
* granular recovery conflicts. Note that InvalidTransactionId values are
|
|
* interpreted as "definitely don't need any conflicts" here, which is a
|
|
* general convention that WAL records can (and often do) depend on.
|
|
*/
|
|
void
|
|
ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
|
|
bool isCatalogRel,
|
|
RelFileLocator locator)
|
|
{
|
|
VirtualTransactionId *backends;
|
|
|
|
/*
|
|
* If we get passed InvalidTransactionId then we do nothing (no conflict).
|
|
*
|
|
* This can happen when replaying already-applied WAL records after a
|
|
* standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
|
|
* record that marks as frozen a page which was already all-visible. It's
|
|
* also quite common with records generated during index deletion
|
|
* (original execution of the deletion can reason that a recovery conflict
|
|
* which is sufficient for the deletion operation must take place before
|
|
* replay of the deletion record itself).
|
|
*/
|
|
if (!TransactionIdIsValid(snapshotConflictHorizon))
|
|
return;
|
|
|
|
Assert(TransactionIdIsNormal(snapshotConflictHorizon));
|
|
backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
|
|
locator.dbOid);
|
|
ResolveRecoveryConflictWithVirtualXIDs(backends,
|
|
PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
|
|
WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
|
|
true);
|
|
|
|
/*
|
|
* Note that WaitExceedsMaxStandbyDelay() is not taken into account here
|
|
* (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That
|
|
* seems OK, given that this kind of conflict should not normally be
|
|
* reached, e.g. due to using a physical replication slot.
|
|
*/
|
|
if (wal_level >= WAL_LEVEL_LOGICAL && isCatalogRel)
|
|
InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid,
|
|
snapshotConflictHorizon);
|
|
}
|
|
|
|
/*
|
|
* Variant of ResolveRecoveryConflictWithSnapshot that works with
|
|
* FullTransactionId values
|
|
*/
|
|
void
|
|
ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon,
|
|
bool isCatalogRel,
|
|
RelFileLocator locator)
|
|
{
|
|
/*
|
|
* ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
|
|
* so truncate the logged FullTransactionId. If the logged value is very
|
|
* old, so that XID wrap-around already happened on it, there can't be any
|
|
* snapshots that still see it.
|
|
*/
|
|
FullTransactionId nextXid = ReadNextFullTransactionId();
|
|
uint64 diff;
|
|
|
|
diff = U64FromFullTransactionId(nextXid) -
|
|
U64FromFullTransactionId(snapshotConflictHorizon);
|
|
if (diff < MaxTransactionId / 2)
|
|
{
|
|
TransactionId truncated;
|
|
|
|
truncated = XidFromFullTransactionId(snapshotConflictHorizon);
|
|
ResolveRecoveryConflictWithSnapshot(truncated,
|
|
isCatalogRel,
|
|
locator);
|
|
}
|
|
}
|
|
|
|
void
|
|
ResolveRecoveryConflictWithTablespace(Oid tsid)
|
|
{
|
|
VirtualTransactionId *temp_file_users;
|
|
|
|
/*
|
|
* Standby users may be currently using this tablespace for their
|
|
* temporary files. We only care about current users because
|
|
* temp_tablespace parameter will just ignore tablespaces that no longer
|
|
* exist.
|
|
*
|
|
* Ask everybody to cancel their queries immediately so we can ensure no
|
|
* temp files remain and we can remove the tablespace. Nuke the entire
|
|
* site from orbit, it's the only way to be sure.
|
|
*
|
|
* XXX: We could work out the pids of active backends using this
|
|
* tablespace by examining the temp filenames in the directory. We would
|
|
* then convert the pids into VirtualXIDs before attempting to cancel
|
|
* them.
|
|
*
|
|
* We don't wait for commit because drop tablespace is non-transactional.
|
|
*/
|
|
temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
|
|
InvalidOid);
|
|
ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
|
|
PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
|
|
WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
|
|
true);
|
|
}
|
|
|
|
void
|
|
ResolveRecoveryConflictWithDatabase(Oid dbid)
|
|
{
|
|
/*
|
|
* We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
|
|
* only waits for transactions and completely idle sessions would block
|
|
* us. This is rare enough that we do this as simply as possible: no wait,
|
|
* just force them off immediately.
|
|
*
|
|
* No locking is required here because we already acquired
|
|
* AccessExclusiveLock. Anybody trying to connect while we do this will
|
|
* block during InitPostgres() and then disconnect when they see the
|
|
* database has been removed.
|
|
*/
|
|
while (CountDBBackends(dbid) > 0)
|
|
{
|
|
CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
|
|
|
|
/*
|
|
* Wait awhile for them to die so that we avoid flooding an
|
|
* unresponsive backend when system is heavily loaded.
|
|
*/
|
|
pg_usleep(10000);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* ResolveRecoveryConflictWithLock is called from ProcSleep()
|
|
* to resolve conflicts with other backends holding relation locks.
|
|
*
|
|
* The WaitLatch sleep normally done in ProcSleep()
|
|
* (when not InHotStandby) is performed here, for code clarity.
|
|
*
|
|
* We either resolve conflicts immediately or set a timeout to wake us at
|
|
* the limit of our patience.
|
|
*
|
|
* Resolve conflicts by canceling all backends holding a conflicting
|
|
* lock. As we are already queued to be granted the lock, no new lock
|
|
* requests conflicting with ours will be granted in the meantime.
|
|
*
|
|
* We also must check for deadlocks involving the Startup process and
|
|
* hot-standby backend processes. If deadlock_timeout is reached in
|
|
* this function, all the backends holding the conflicting locks are
|
|
* requested to check themselves for deadlocks.
|
|
*
|
|
* logging_conflict should be true if the recovery conflict has not been
|
|
* logged yet even though logging is enabled. After deadlock_timeout is
|
|
* reached and the request for deadlock check is sent, we wait again to
|
|
* be signaled by the release of the lock if logging_conflict is false.
|
|
* Otherwise we return without waiting again so that the caller can report
|
|
* the recovery conflict. In this case, then, this function is called again
|
|
* with logging_conflict=false (because the recovery conflict has already
|
|
* been logged) and we will wait again for the lock to be released.
|
|
*/
|
|
void
|
|
ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
|
|
{
|
|
TimestampTz ltime;
|
|
TimestampTz now;
|
|
|
|
Assert(InHotStandby);
|
|
|
|
ltime = GetStandbyLimitTime();
|
|
now = GetCurrentTimestamp();
|
|
|
|
/*
|
|
* Update waitStart if first time through after the startup process
|
|
* started waiting for the lock. It should not be updated every time
|
|
* ResolveRecoveryConflictWithLock() is called during the wait.
|
|
*
|
|
* Use the current time obtained for comparison with ltime as waitStart
|
|
* (i.e., the time when this process started waiting for the lock). Since
|
|
* getting the current time newly can cause overhead, we reuse the
|
|
* already-obtained time to avoid that overhead.
|
|
*
|
|
* Note that waitStart is updated without holding the lock table's
|
|
* partition lock, to avoid the overhead by additional lock acquisition.
|
|
* This can cause "waitstart" in pg_locks to become NULL for a very short
|
|
* period of time after the wait started even though "granted" is false.
|
|
* This is OK in practice because we can assume that users are likely to
|
|
* look at "waitstart" when waiting for the lock for a long time.
|
|
*/
|
|
if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
|
|
pg_atomic_write_u64(&MyProc->waitStart, now);
|
|
|
|
if (now >= ltime && ltime != 0)
|
|
{
|
|
/*
|
|
* We're already behind, so clear a path as quickly as possible.
|
|
*/
|
|
VirtualTransactionId *backends;
|
|
|
|
backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
|
|
|
|
/*
|
|
* Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
|
|
* "waiting" in PS display by disabling its argument report_waiting
|
|
* because the caller, WaitOnLock(), has already reported that.
|
|
*/
|
|
ResolveRecoveryConflictWithVirtualXIDs(backends,
|
|
PROCSIG_RECOVERY_CONFLICT_LOCK,
|
|
PG_WAIT_LOCK | locktag.locktag_type,
|
|
false);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* Wait (or wait again) until ltime, and check for deadlocks as well
|
|
* if we will be waiting longer than deadlock_timeout
|
|
*/
|
|
EnableTimeoutParams timeouts[2];
|
|
int cnt = 0;
|
|
|
|
if (ltime != 0)
|
|
{
|
|
got_standby_lock_timeout = false;
|
|
timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
|
|
timeouts[cnt].type = TMPARAM_AT;
|
|
timeouts[cnt].fin_time = ltime;
|
|
cnt++;
|
|
}
|
|
|
|
got_standby_deadlock_timeout = false;
|
|
timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
|
|
timeouts[cnt].type = TMPARAM_AFTER;
|
|
timeouts[cnt].delay_ms = DeadlockTimeout;
|
|
cnt++;
|
|
|
|
enable_timeouts(timeouts, cnt);
|
|
}
|
|
|
|
/* Wait to be signaled by the release of the Relation Lock */
|
|
ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
|
|
|
|
/*
|
|
* Exit if ltime is reached. Then all the backends holding conflicting
|
|
* locks will be canceled in the next ResolveRecoveryConflictWithLock()
|
|
* call.
|
|
*/
|
|
if (got_standby_lock_timeout)
|
|
goto cleanup;
|
|
|
|
if (got_standby_deadlock_timeout)
|
|
{
|
|
VirtualTransactionId *backends;
|
|
|
|
backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
|
|
|
|
/* Quick exit if there's no work to be done */
|
|
if (!VirtualTransactionIdIsValid(*backends))
|
|
goto cleanup;
|
|
|
|
/*
|
|
* Send signals to all the backends holding the conflicting locks, to
|
|
* ask them to check themselves for deadlocks.
|
|
*/
|
|
while (VirtualTransactionIdIsValid(*backends))
|
|
{
|
|
SignalVirtualTransaction(*backends,
|
|
PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
|
|
false);
|
|
backends++;
|
|
}
|
|
|
|
/*
|
|
* Exit if the recovery conflict has not been logged yet even though
|
|
* logging is enabled, so that the caller can log that. Then
|
|
* RecoveryConflictWithLock() is called again and we will wait again
|
|
* for the lock to be released.
|
|
*/
|
|
if (logging_conflict)
|
|
goto cleanup;
|
|
|
|
/*
|
|
* Wait again here to be signaled by the release of the Relation Lock,
|
|
* to prevent the subsequent RecoveryConflictWithLock() from causing
|
|
* deadlock_timeout and sending a request for deadlocks check again.
|
|
* Otherwise the request continues to be sent every deadlock_timeout
|
|
* until the relation locks are released or ltime is reached.
|
|
*/
|
|
got_standby_deadlock_timeout = false;
|
|
ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
|
|
}
|
|
|
|
cleanup:
|
|
|
|
/*
|
|
* Clear any timeout requests established above. We assume here that the
|
|
* Startup process doesn't have any other outstanding timeouts than those
|
|
* used by this function. If that stops being true, we could cancel the
|
|
* timeouts individually, but that'd be slower.
|
|
*/
|
|
disable_all_timeouts(false);
|
|
got_standby_lock_timeout = false;
|
|
got_standby_deadlock_timeout = false;
|
|
}
|
|
|
|
/*
|
|
* ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
|
|
* to resolve conflicts with other backends holding buffer pins.
|
|
*
|
|
* The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
|
|
* (when not InHotStandby) is performed here, for code clarity.
|
|
*
|
|
* We either resolve conflicts immediately or set a timeout to wake us at
|
|
* the limit of our patience.
|
|
*
|
|
* Resolve conflicts by sending a PROCSIG signal to all backends to check if
|
|
* they hold one of the buffer pins that is blocking Startup process. If so,
|
|
* those backends will take an appropriate error action, ERROR or FATAL.
|
|
*
|
|
* We also must check for deadlocks. Deadlocks occur because if queries
|
|
* wait on a lock, that must be behind an AccessExclusiveLock, which can only
|
|
* be cleared if the Startup process replays a transaction completion record.
|
|
* If Startup process is also waiting then that is a deadlock. The deadlock
|
|
* can occur if the query is waiting and then the Startup sleeps, or if
|
|
* Startup is sleeping and the query waits on a lock. We protect against
|
|
* only the former sequence here, the latter sequence is checked prior to
|
|
* the query sleeping, in CheckRecoveryConflictDeadlock().
|
|
*
|
|
* Deadlocks are extremely rare, and relatively expensive to check for,
|
|
* so we don't do a deadlock check right away ... only if we have had to wait
|
|
* at least deadlock_timeout.
|
|
*/
|
|
void
|
|
ResolveRecoveryConflictWithBufferPin(void)
|
|
{
|
|
TimestampTz ltime;
|
|
|
|
Assert(InHotStandby);
|
|
|
|
ltime = GetStandbyLimitTime();
|
|
|
|
if (GetCurrentTimestamp() >= ltime && ltime != 0)
|
|
{
|
|
/*
|
|
* We're already behind, so clear a path as quickly as possible.
|
|
*/
|
|
SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* Wake up at ltime, and check for deadlocks as well if we will be
|
|
* waiting longer than deadlock_timeout
|
|
*/
|
|
EnableTimeoutParams timeouts[2];
|
|
int cnt = 0;
|
|
|
|
if (ltime != 0)
|
|
{
|
|
timeouts[cnt].id = STANDBY_TIMEOUT;
|
|
timeouts[cnt].type = TMPARAM_AT;
|
|
timeouts[cnt].fin_time = ltime;
|
|
cnt++;
|
|
}
|
|
|
|
got_standby_deadlock_timeout = false;
|
|
timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
|
|
timeouts[cnt].type = TMPARAM_AFTER;
|
|
timeouts[cnt].delay_ms = DeadlockTimeout;
|
|
cnt++;
|
|
|
|
enable_timeouts(timeouts, cnt);
|
|
}
|
|
|
|
/*
|
|
* Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
|
|
* by one of the timeouts established above.
|
|
*
|
|
* We assume that only UnpinBuffer() and the timeout requests established
|
|
* above can wake us up here. WakeupRecovery() called by walreceiver or
|
|
* SIGHUP signal handler, etc cannot do that because it uses the different
|
|
* latch from that ProcWaitForSignal() waits on.
|
|
*/
|
|
ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
|
|
|
|
if (got_standby_delay_timeout)
|
|
SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
|
|
else if (got_standby_deadlock_timeout)
|
|
{
|
|
/*
|
|
* Send out a request for hot-standby backends to check themselves for
|
|
* deadlocks.
|
|
*
|
|
* XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
|
|
* to be signaled by UnpinBuffer() again and send a request for
|
|
* deadlocks check if deadlock_timeout happens. This causes the
|
|
* request to continue to be sent every deadlock_timeout until the
|
|
* buffer is unpinned or ltime is reached. This would increase the
|
|
* workload in the startup process and backends. In practice it may
|
|
* not be so harmful because the period that the buffer is kept pinned
|
|
* is basically no so long. But we should fix this?
|
|
*/
|
|
SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
|
|
}
|
|
|
|
/*
|
|
* Clear any timeout requests established above. We assume here that the
|
|
* Startup process doesn't have any other timeouts than what this function
|
|
* uses. If that stops being true, we could cancel the timeouts
|
|
* individually, but that'd be slower.
|
|
*/
|
|
disable_all_timeouts(false);
|
|
got_standby_delay_timeout = false;
|
|
got_standby_deadlock_timeout = false;
|
|
}
|
|
|
|
static void
|
|
SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
|
|
{
|
|
Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
|
|
reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
|
|
|
|
/*
|
|
* We send signal to all backends to ask them if they are holding the
|
|
* buffer pin which is delaying the Startup process. We must not set the
|
|
* conflict flag yet, since most backends will be innocent. Let the
|
|
* SIGUSR1 handling in each backend decide their own fate.
|
|
*/
|
|
CancelDBBackends(InvalidOid, reason, false);
|
|
}
|
|
|
|
/*
|
|
* In Hot Standby perform early deadlock detection. We abort the lock
|
|
* wait if we are about to sleep while holding the buffer pin that Startup
|
|
* process is waiting for.
|
|
*
|
|
* Note: this code is pessimistic, because there is no way for it to
|
|
* determine whether an actual deadlock condition is present: the lock we
|
|
* need to wait for might be unrelated to any held by the Startup process.
|
|
* Sooner or later, this mechanism should get ripped out in favor of somehow
|
|
* accounting for buffer locks in DeadLockCheck(). However, errors here
|
|
* seem to be very low-probability in practice, so for now it's not worth
|
|
* the trouble.
|
|
*/
|
|
void
|
|
CheckRecoveryConflictDeadlock(void)
|
|
{
|
|
Assert(!InRecovery); /* do not call in Startup process */
|
|
|
|
if (!HoldingBufferPinThatDelaysRecovery())
|
|
return;
|
|
|
|
/*
|
|
* Error message should match ProcessInterrupts() but we avoid calling
|
|
* that because we aren't handling an interrupt at this point. Note that
|
|
* we only cancel the current transaction here, so if we are in a
|
|
* subtransaction and the pin is held by a parent, then the Startup
|
|
* process will continue to wait even though we have avoided deadlock.
|
|
*/
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
|
|
errmsg("canceling statement due to conflict with recovery"),
|
|
errdetail("User transaction caused buffer deadlock with recovery.")));
|
|
}
|
|
|
|
|
|
/* --------------------------------
|
|
* timeout handler routines
|
|
* --------------------------------
|
|
*/
|
|
|
|
/*
|
|
* StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
|
|
* exceeded.
|
|
*/
|
|
void
|
|
StandbyDeadLockHandler(void)
|
|
{
|
|
got_standby_deadlock_timeout = true;
|
|
}
|
|
|
|
/*
|
|
* StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
|
|
*/
|
|
void
|
|
StandbyTimeoutHandler(void)
|
|
{
|
|
got_standby_delay_timeout = true;
|
|
}
|
|
|
|
/*
|
|
* StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
|
|
*/
|
|
void
|
|
StandbyLockTimeoutHandler(void)
|
|
{
|
|
got_standby_lock_timeout = true;
|
|
}
|
|
|
|
/*
|
|
* -----------------------------------------------------
|
|
* Locking in Recovery Mode
|
|
* -----------------------------------------------------
|
|
*
|
|
* All locks are held by the Startup process using a single virtual
|
|
* transaction. This implementation is both simpler and in some senses,
|
|
* more correct. The locks held mean "some original transaction held
|
|
* this lock, so query access is not allowed at this time". So the Startup
|
|
* process is the proxy by which the original locks are implemented.
|
|
*
|
|
* We only keep track of AccessExclusiveLocks, which are only ever held by
|
|
* one transaction on one relation.
|
|
*
|
|
* We keep a table of known locks in the RecoveryLockHash hash table.
|
|
* The point of that table is to let us efficiently de-duplicate locks,
|
|
* which is important because checkpoints will re-report the same locks
|
|
* already held. There is also a RecoveryLockXidHash table with one entry
|
|
* per xid, which allows us to efficiently find all the locks held by a
|
|
* given original transaction.
|
|
*
|
|
* We use session locks rather than normal locks so we don't need
|
|
* ResourceOwners.
|
|
*/
|
|
|
|
|
|
void
|
|
StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
|
|
{
|
|
RecoveryLockXidEntry *xidentry;
|
|
RecoveryLockEntry *lockentry;
|
|
xl_standby_lock key;
|
|
LOCKTAG locktag;
|
|
bool found;
|
|
|
|
/* Already processed? */
|
|
if (!TransactionIdIsValid(xid) ||
|
|
TransactionIdDidCommit(xid) ||
|
|
TransactionIdDidAbort(xid))
|
|
return;
|
|
|
|
elog(DEBUG4, "adding recovery lock: db %u rel %u", dbOid, relOid);
|
|
|
|
/* dbOid is InvalidOid when we are locking a shared relation. */
|
|
Assert(OidIsValid(relOid));
|
|
|
|
/* Create a hash entry for this xid, if we don't have one already. */
|
|
xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
|
|
if (!found)
|
|
{
|
|
Assert(xidentry->xid == xid); /* dynahash should have set this */
|
|
xidentry->head = NULL;
|
|
}
|
|
|
|
/* Create a hash entry for this lock, unless we have one already. */
|
|
key.xid = xid;
|
|
key.dbOid = dbOid;
|
|
key.relOid = relOid;
|
|
lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
|
|
if (!found)
|
|
{
|
|
/* It's new, so link it into the XID's list ... */
|
|
lockentry->next = xidentry->head;
|
|
xidentry->head = lockentry;
|
|
|
|
/* ... and acquire the lock locally. */
|
|
SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
|
|
|
|
(void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Release all the locks associated with this RecoveryLockXidEntry.
|
|
*/
|
|
static void
|
|
StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
|
|
{
|
|
RecoveryLockEntry *entry;
|
|
RecoveryLockEntry *next;
|
|
|
|
for (entry = xidentry->head; entry != NULL; entry = next)
|
|
{
|
|
LOCKTAG locktag;
|
|
|
|
elog(DEBUG4,
|
|
"releasing recovery lock: xid %u db %u rel %u",
|
|
entry->key.xid, entry->key.dbOid, entry->key.relOid);
|
|
/* Release the lock ... */
|
|
SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
|
|
if (!LockRelease(&locktag, AccessExclusiveLock, true))
|
|
{
|
|
elog(LOG,
|
|
"RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
|
|
entry->key.xid, entry->key.dbOid, entry->key.relOid);
|
|
Assert(false);
|
|
}
|
|
/* ... and remove the per-lock hash entry */
|
|
next = entry->next;
|
|
hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
|
|
}
|
|
|
|
xidentry->head = NULL; /* just for paranoia */
|
|
}
|
|
|
|
/*
|
|
* Release locks for specific XID, or all locks if it's InvalidXid.
|
|
*/
|
|
static void
|
|
StandbyReleaseLocks(TransactionId xid)
|
|
{
|
|
RecoveryLockXidEntry *entry;
|
|
|
|
if (TransactionIdIsValid(xid))
|
|
{
|
|
if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
|
|
{
|
|
StandbyReleaseXidEntryLocks(entry);
|
|
hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
|
|
}
|
|
}
|
|
else
|
|
StandbyReleaseAllLocks();
|
|
}
|
|
|
|
/*
|
|
* Release locks for a transaction tree, starting at xid down, from
|
|
* RecoveryLockXidHash.
|
|
*
|
|
* Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
|
|
* to remove any AccessExclusiveLocks requested by a transaction.
|
|
*/
|
|
void
|
|
StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
|
|
{
|
|
int i;
|
|
|
|
StandbyReleaseLocks(xid);
|
|
|
|
for (i = 0; i < nsubxids; i++)
|
|
StandbyReleaseLocks(subxids[i]);
|
|
}
|
|
|
|
/*
|
|
* Called at end of recovery and when we see a shutdown checkpoint.
|
|
*/
|
|
void
|
|
StandbyReleaseAllLocks(void)
|
|
{
|
|
HASH_SEQ_STATUS status;
|
|
RecoveryLockXidEntry *entry;
|
|
|
|
elog(DEBUG2, "release all standby locks");
|
|
|
|
hash_seq_init(&status, RecoveryLockXidHash);
|
|
while ((entry = hash_seq_search(&status)))
|
|
{
|
|
StandbyReleaseXidEntryLocks(entry);
|
|
hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* StandbyReleaseOldLocks
|
|
* Release standby locks held by top-level XIDs that aren't running,
|
|
* as long as they're not prepared transactions.
|
|
*
|
|
* This is needed to prune the locks of crashed transactions, which didn't
|
|
* write an ABORT/COMMIT record.
|
|
*/
|
|
void
|
|
StandbyReleaseOldLocks(TransactionId oldxid)
|
|
{
|
|
HASH_SEQ_STATUS status;
|
|
RecoveryLockXidEntry *entry;
|
|
|
|
hash_seq_init(&status, RecoveryLockXidHash);
|
|
while ((entry = hash_seq_search(&status)))
|
|
{
|
|
Assert(TransactionIdIsValid(entry->xid));
|
|
|
|
/* Skip if prepared transaction. */
|
|
if (StandbyTransactionIdIsPrepared(entry->xid))
|
|
continue;
|
|
|
|
/* Skip if >= oldxid. */
|
|
if (!TransactionIdPrecedes(entry->xid, oldxid))
|
|
continue;
|
|
|
|
/* Remove all locks and hash table entry. */
|
|
StandbyReleaseXidEntryLocks(entry);
|
|
hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* --------------------------------------------------------------------
|
|
* Recovery handling for Rmgr RM_STANDBY_ID
|
|
*
|
|
* These record types will only be created if XLogStandbyInfoActive()
|
|
* --------------------------------------------------------------------
|
|
*/
|
|
|
|
void
|
|
standby_redo(XLogReaderState *record)
|
|
{
|
|
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
|
|
|
|
/* Backup blocks are not used in standby records */
|
|
Assert(!XLogRecHasAnyBlockRefs(record));
|
|
|
|
/* Do nothing if we're not in hot standby mode */
|
|
if (standbyState == STANDBY_DISABLED)
|
|
return;
|
|
|
|
if (info == XLOG_STANDBY_LOCK)
|
|
{
|
|
xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
|
|
int i;
|
|
|
|
for (i = 0; i < xlrec->nlocks; i++)
|
|
StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
|
|
xlrec->locks[i].dbOid,
|
|
xlrec->locks[i].relOid);
|
|
}
|
|
else if (info == XLOG_RUNNING_XACTS)
|
|
{
|
|
xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
|
|
RunningTransactionsData running;
|
|
|
|
running.xcnt = xlrec->xcnt;
|
|
running.subxcnt = xlrec->subxcnt;
|
|
running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY;
|
|
running.nextXid = xlrec->nextXid;
|
|
running.latestCompletedXid = xlrec->latestCompletedXid;
|
|
running.oldestRunningXid = xlrec->oldestRunningXid;
|
|
running.xids = xlrec->xids;
|
|
|
|
ProcArrayApplyRecoveryInfo(&running);
|
|
|
|
/*
|
|
* The startup process currently has no convenient way to schedule
|
|
* stats to be reported. XLOG_RUNNING_XACTS records issued at a
|
|
* regular cadence, making this a convenient location to report stats.
|
|
* While these records aren't generated with wal_level=minimal, stats
|
|
* also cannot be accessed during WAL replay.
|
|
*/
|
|
pgstat_report_stat(true);
|
|
}
|
|
else if (info == XLOG_INVALIDATIONS)
|
|
{
|
|
xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
|
|
|
|
ProcessCommittedInvalidationMessages(xlrec->msgs,
|
|
xlrec->nmsgs,
|
|
xlrec->relcacheInitFileInval,
|
|
xlrec->dbId,
|
|
xlrec->tsId);
|
|
}
|
|
else
|
|
elog(PANIC, "standby_redo: unknown op code %u", info);
|
|
}
|
|
|
|
/*
|
|
* Log details of the current snapshot to WAL. This allows the snapshot state
|
|
* to be reconstructed on the standby and for logical decoding.
|
|
*
|
|
* This is used for Hot Standby as follows:
|
|
*
|
|
* We can move directly to STANDBY_SNAPSHOT_READY at startup if we
|
|
* start from a shutdown checkpoint because we know nothing was running
|
|
* at that time and our recovery snapshot is known empty. In the more
|
|
* typical case of an online checkpoint we need to jump through a few
|
|
* hoops to get a correct recovery snapshot and this requires a two or
|
|
* sometimes a three stage process.
|
|
*
|
|
* The initial snapshot must contain all running xids and all current
|
|
* AccessExclusiveLocks at a point in time on the standby. Assembling
|
|
* that information while the server is running requires many and
|
|
* various LWLocks, so we choose to derive that information piece by
|
|
* piece and then re-assemble that info on the standby. When that
|
|
* information is fully assembled we move to STANDBY_SNAPSHOT_READY.
|
|
*
|
|
* Since locking on the primary when we derive the information is not
|
|
* strict, we note that there is a time window between the derivation and
|
|
* writing to WAL of the derived information. That allows race conditions
|
|
* that we must resolve, since xids and locks may enter or leave the
|
|
* snapshot during that window. This creates the issue that an xid or
|
|
* lock may start *after* the snapshot has been derived yet *before* the
|
|
* snapshot is logged in the running xacts WAL record. We resolve this by
|
|
* starting to accumulate changes at a point just prior to when we derive
|
|
* the snapshot on the primary, then ignore duplicates when we later apply
|
|
* the snapshot from the running xacts record. This is implemented during
|
|
* CreateCheckPoint() where we use the logical checkpoint location as
|
|
* our starting point and then write the running xacts record immediately
|
|
* before writing the main checkpoint WAL record. Since we always start
|
|
* up from a checkpoint and are immediately at our starting point, we
|
|
* unconditionally move to STANDBY_INITIALIZED. After this point we
|
|
* must do 4 things:
|
|
* * move shared nextXid forwards as we see new xids
|
|
* * extend the clog and subtrans with each new xid
|
|
* * keep track of uncommitted known assigned xids
|
|
* * keep track of uncommitted AccessExclusiveLocks
|
|
*
|
|
* When we see a commit/abort we must remove known assigned xids and locks
|
|
* from the completing transaction. Attempted removals that cannot locate
|
|
* an entry are expected and must not cause an error when we are in state
|
|
* STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
|
|
* KnownAssignedXidsRemove().
|
|
*
|
|
* Later, when we apply the running xact data we must be careful to ignore
|
|
* transactions already committed, since those commits raced ahead when
|
|
* making WAL entries.
|
|
*
|
|
* For logical decoding only the running xacts information is needed;
|
|
* there's no need to look at the locking information, but it's logged anyway,
|
|
* as there's no independent knob to just enable logical decoding. For
|
|
* details of how this is used, check snapbuild.c's introductory comment.
|
|
*
|
|
*
|
|
* Returns the RecPtr of the last inserted record.
|
|
*/
|
|
XLogRecPtr
|
|
LogStandbySnapshot(void)
|
|
{
|
|
XLogRecPtr recptr;
|
|
RunningTransactions running;
|
|
xl_standby_lock *locks;
|
|
int nlocks;
|
|
|
|
Assert(XLogStandbyInfoActive());
|
|
|
|
#ifdef USE_INJECTION_POINTS
|
|
if (IS_INJECTION_POINT_ATTACHED("skip-log-running-xacts"))
|
|
{
|
|
/*
|
|
* This record could move slot's xmin forward during decoding, leading
|
|
* to unpredictable results, so skip it when requested by the test.
|
|
*/
|
|
return GetInsertRecPtr();
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* Get details of any AccessExclusiveLocks being held at the moment.
|
|
*/
|
|
locks = GetRunningTransactionLocks(&nlocks);
|
|
if (nlocks > 0)
|
|
LogAccessExclusiveLocks(nlocks, locks);
|
|
pfree(locks);
|
|
|
|
/*
|
|
* Log details of all in-progress transactions. This should be the last
|
|
* record we write, because standby will open up when it sees this.
|
|
*/
|
|
running = GetRunningTransactionData();
|
|
|
|
/*
|
|
* GetRunningTransactionData() acquired ProcArrayLock, we must release it.
|
|
* For Hot Standby this can be done before inserting the WAL record
|
|
* because ProcArrayApplyRecoveryInfo() rechecks the commit status using
|
|
* the clog. For logical decoding, though, the lock can't be released
|
|
* early because the clog might be "in the future" from the POV of the
|
|
* historic snapshot. This would allow for situations where we're waiting
|
|
* for the end of a transaction listed in the xl_running_xacts record
|
|
* which, according to the WAL, has committed before the xl_running_xacts
|
|
* record. Fortunately this routine isn't executed frequently, and it's
|
|
* only a shared lock.
|
|
*/
|
|
if (wal_level < WAL_LEVEL_LOGICAL)
|
|
LWLockRelease(ProcArrayLock);
|
|
|
|
recptr = LogCurrentRunningXacts(running);
|
|
|
|
/* Release lock if we kept it longer ... */
|
|
if (wal_level >= WAL_LEVEL_LOGICAL)
|
|
LWLockRelease(ProcArrayLock);
|
|
|
|
/* GetRunningTransactionData() acquired XidGenLock, we must release it */
|
|
LWLockRelease(XidGenLock);
|
|
|
|
return recptr;
|
|
}
|
|
|
|
/*
|
|
* Record an enhanced snapshot of running transactions into WAL.
|
|
*
|
|
* The definitions of RunningTransactionsData and xl_running_xacts are
|
|
* similar. We keep them separate because xl_running_xacts is a contiguous
|
|
* chunk of memory and never exists fully until it is assembled in WAL.
|
|
* The inserted records are marked as not being important for durability,
|
|
* to avoid triggering superfluous checkpoint / archiving activity.
|
|
*/
|
|
static XLogRecPtr
|
|
LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
|
|
{
|
|
xl_running_xacts xlrec;
|
|
XLogRecPtr recptr;
|
|
|
|
xlrec.xcnt = CurrRunningXacts->xcnt;
|
|
xlrec.subxcnt = CurrRunningXacts->subxcnt;
|
|
xlrec.subxid_overflow = (CurrRunningXacts->subxid_status != SUBXIDS_IN_ARRAY);
|
|
xlrec.nextXid = CurrRunningXacts->nextXid;
|
|
xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
|
|
xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
|
|
|
|
/* Header */
|
|
XLogBeginInsert();
|
|
XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
|
|
XLogRegisterData(&xlrec, MinSizeOfXactRunningXacts);
|
|
|
|
/* array of TransactionIds */
|
|
if (xlrec.xcnt > 0)
|
|
XLogRegisterData(CurrRunningXacts->xids,
|
|
(xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
|
|
|
|
recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
|
|
|
|
if (xlrec.subxid_overflow)
|
|
elog(DEBUG2,
|
|
"snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
|
|
CurrRunningXacts->xcnt,
|
|
LSN_FORMAT_ARGS(recptr),
|
|
CurrRunningXacts->oldestRunningXid,
|
|
CurrRunningXacts->latestCompletedXid,
|
|
CurrRunningXacts->nextXid);
|
|
else
|
|
elog(DEBUG2,
|
|
"snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
|
|
CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
|
|
LSN_FORMAT_ARGS(recptr),
|
|
CurrRunningXacts->oldestRunningXid,
|
|
CurrRunningXacts->latestCompletedXid,
|
|
CurrRunningXacts->nextXid);
|
|
|
|
/*
|
|
* Ensure running_xacts information is synced to disk not too far in the
|
|
* future. We don't want to stall anything though (i.e. use XLogFlush()),
|
|
* so we let the wal writer do it during normal operation.
|
|
* XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
|
|
* and nudge the WALWriter into action if sleeping. Check
|
|
* XLogBackgroundFlush() for details why a record might not be flushed
|
|
* without it.
|
|
*/
|
|
XLogSetAsyncXactLSN(recptr);
|
|
|
|
return recptr;
|
|
}
|
|
|
|
/*
|
|
* Wholesale logging of AccessExclusiveLocks. Other lock types need not be
|
|
* logged, as described in backend/storage/lmgr/README.
|
|
*/
|
|
static void
|
|
LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
|
|
{
|
|
xl_standby_locks xlrec;
|
|
|
|
xlrec.nlocks = nlocks;
|
|
|
|
XLogBeginInsert();
|
|
XLogRegisterData(&xlrec, offsetof(xl_standby_locks, locks));
|
|
XLogRegisterData(locks, nlocks * sizeof(xl_standby_lock));
|
|
XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
|
|
|
|
(void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
|
|
}
|
|
|
|
/*
|
|
* Individual logging of AccessExclusiveLocks for use during LockAcquire()
|
|
*/
|
|
void
|
|
LogAccessExclusiveLock(Oid dbOid, Oid relOid)
|
|
{
|
|
xl_standby_lock xlrec;
|
|
|
|
xlrec.xid = GetCurrentTransactionId();
|
|
|
|
xlrec.dbOid = dbOid;
|
|
xlrec.relOid = relOid;
|
|
|
|
LogAccessExclusiveLocks(1, &xlrec);
|
|
MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
|
|
}
|
|
|
|
/*
 * Prepare to log an AccessExclusiveLock, for use during LockAcquire().
 *
 * The only effect is to force assignment of a TransactionId to the current
 * transaction.
 */
void
LogAccessExclusiveLockPrepare(void)
{
	/*
	 * There are two reasons to assign an xid here, both related to lock
	 * release on the standby.
	 *
	 * First, with an xid assigned, RecordTransactionCommit() and
	 * RecordTransactionAbort() do not optimise away the transaction
	 * completion record, which recovery relies upon to release locks.  It's
	 * a hack, but for a corner case not worth adding code for into the main
	 * commit path.
	 *
	 * Second, the xid must be assigned before the lock is recorded in
	 * shared memory.  Otherwise a concurrently executing
	 * GetRunningTransactionLocks() might see a lock associated with an
	 * InvalidTransactionId, which we later assert cannot happen.
	 */
	(void) GetCurrentTransactionId();
}
|
|
|
|
/*
|
|
* Emit WAL for invalidations. This currently is only used for commits without
|
|
* an xid but which contain invalidations.
|
|
*/
|
|
void
|
|
LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
|
|
bool relcacheInitFileInval)
|
|
{
|
|
xl_invalidations xlrec;
|
|
|
|
/* prepare record */
|
|
memset(&xlrec, 0, sizeof(xlrec));
|
|
xlrec.dbId = MyDatabaseId;
|
|
xlrec.tsId = MyDatabaseTableSpace;
|
|
xlrec.relcacheInitFileInval = relcacheInitFileInval;
|
|
xlrec.nmsgs = nmsgs;
|
|
|
|
/* perform insertion */
|
|
XLogBeginInsert();
|
|
XLogRegisterData(&xlrec, MinSizeOfInvalidations);
|
|
XLogRegisterData(msgs,
|
|
nmsgs * sizeof(SharedInvalidationMessage));
|
|
XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
|
|
}
|
|
|
|
/* Return the description of recovery conflict */
|
|
static const char *
|
|
get_recovery_conflict_desc(ProcSignalReason reason)
|
|
{
|
|
const char *reasonDesc = _("unknown reason");
|
|
|
|
switch (reason)
|
|
{
|
|
case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
|
|
reasonDesc = _("recovery conflict on buffer pin");
|
|
break;
|
|
case PROCSIG_RECOVERY_CONFLICT_LOCK:
|
|
reasonDesc = _("recovery conflict on lock");
|
|
break;
|
|
case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
|
|
reasonDesc = _("recovery conflict on tablespace");
|
|
break;
|
|
case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
|
|
reasonDesc = _("recovery conflict on snapshot");
|
|
break;
|
|
case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT:
|
|
reasonDesc = _("recovery conflict on replication slot");
|
|
break;
|
|
case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
|
|
reasonDesc = _("recovery conflict on buffer deadlock");
|
|
break;
|
|
case PROCSIG_RECOVERY_CONFLICT_DATABASE:
|
|
reasonDesc = _("recovery conflict on database");
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return reasonDesc;
|
|
}
|