mirror of
https://github.com/postgres/postgres.git
synced 2025-05-21 15:54:08 +03:00
929 lines
27 KiB
C
929 lines
27 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* standby.c
|
|
* Misc functions used in Hot Standby mode.
|
|
*
|
|
* All functions for handling RM_STANDBY_ID, which relate to
|
|
* AccessExclusiveLocks and starting snapshots for Hot Standby mode.
|
|
* Plus conflict recovery processing.
|
|
*
|
|
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
* IDENTIFICATION
|
|
* $PostgreSQL: pgsql/src/backend/storage/ipc/standby.c,v 1.16 2010/04/06 10:50:57 sriggs Exp $
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
#include "access/transam.h"
|
|
#include "access/twophase.h"
|
|
#include "access/xact.h"
|
|
#include "access/xlog.h"
|
|
#include "miscadmin.h"
|
|
#include "pgstat.h"
|
|
#include "storage/bufmgr.h"
|
|
#include "storage/lmgr.h"
|
|
#include "storage/proc.h"
|
|
#include "storage/procarray.h"
|
|
#include "storage/sinvaladt.h"
|
|
#include "storage/standby.h"
|
|
#include "utils/ps_status.h"
|
|
|
|
int vacuum_defer_cleanup_age;
|
|
|
|
static List *RecoveryLockList;
|
|
|
|
static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
|
|
ProcSignalReason reason);
|
|
static void ResolveRecoveryConflictWithLock(Oid dbOid, Oid relOid);
|
|
static void LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
|
|
static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
|
|
|
|
/*
|
|
* InitRecoveryTransactionEnvironment
|
|
* Initiallize tracking of in-progress transactions in master
|
|
*
|
|
* We need to issue shared invalidations and hold locks. Holding locks
|
|
* means others may want to wait on us, so we need to make lock table
|
|
* inserts to appear like a transaction. We could create and delete
|
|
* lock table entries for each transaction but its simpler just to create
|
|
* one permanent entry and leave it there all the time. Locks are then
|
|
* acquired and released as needed. Yes, this means you can see the
|
|
* Startup process in pg_locks once we have run this.
|
|
*/
|
|
void
|
|
InitRecoveryTransactionEnvironment(void)
|
|
{
|
|
VirtualTransactionId vxid;
|
|
|
|
/*
|
|
* Initialise shared invalidation management for Startup process, being
|
|
* careful to register ourselves as a sendOnly process so we don't need to
|
|
* read messages, nor will we get signalled when the queue starts filling
|
|
* up.
|
|
*/
|
|
SharedInvalBackendInit(true);
|
|
|
|
/*
|
|
* Record the PID and PGPROC structure of the startup process.
|
|
*/
|
|
PublishStartupProcessInformation();
|
|
|
|
/*
|
|
* Lock a virtual transaction id for Startup process.
|
|
*
|
|
* We need to do GetNextLocalTransactionId() because
|
|
* SharedInvalBackendInit() leaves localTransactionid invalid and the lock
|
|
* manager doesn't like that at all.
|
|
*
|
|
* Note that we don't need to run XactLockTableInsert() because nobody
|
|
* needs to wait on xids. That sounds a little strange, but table locks
|
|
* are held by vxids and row level locks are held by xids. All queries
|
|
* hold AccessShareLocks so never block while we write or lock new rows.
|
|
*/
|
|
vxid.backendId = MyBackendId;
|
|
vxid.localTransactionId = GetNextLocalTransactionId();
|
|
VirtualXactLockTableInsert(vxid);
|
|
|
|
standbyState = STANDBY_INITIALIZED;
|
|
}
|
|
|
|
/*
|
|
* ShutdownRecoveryTransactionEnvironment
|
|
* Shut down transaction tracking
|
|
*
|
|
* Prepare to switch from hot standby mode to normal operation. Shut down
|
|
* recovery-time transaction tracking.
|
|
*/
|
|
void
|
|
ShutdownRecoveryTransactionEnvironment(void)
|
|
{
|
|
/* Mark all tracked in-progress transactions as finished. */
|
|
ExpireAllKnownAssignedTransactionIds();
|
|
|
|
/* Release all locks the tracked transactions were holding */
|
|
StandbyReleaseAllLocks();
|
|
}
|
|
|
|
|
|
/*
|
|
* -----------------------------------------------------
|
|
* Standby wait timers and backend cancel logic
|
|
* -----------------------------------------------------
|
|
*/
|
|
|
|
#define STANDBY_INITIAL_WAIT_US 1000
|
|
static int standbyWait_us = STANDBY_INITIAL_WAIT_US;
|
|
|
|
/*
|
|
* Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
|
|
* We wait here for a while then return. If we decide we can't wait any
|
|
* more then we return true, if we can wait some more return false.
|
|
*/
|
|
static bool
|
|
WaitExceedsMaxStandbyDelay(void)
|
|
{
|
|
long delay_secs;
|
|
int delay_usecs;
|
|
|
|
if (MaxStandbyDelay == -1)
|
|
return false;
|
|
|
|
/* Are we past max_standby_delay? */
|
|
TimestampDifference(GetLatestXLogTime(), GetCurrentTimestamp(),
|
|
&delay_secs, &delay_usecs);
|
|
if (delay_secs > MaxStandbyDelay)
|
|
return true;
|
|
|
|
/*
|
|
* Sleep, then do bookkeeping.
|
|
*/
|
|
pg_usleep(standbyWait_us);
|
|
|
|
/*
|
|
* Progressively increase the sleep times.
|
|
*/
|
|
standbyWait_us *= 2;
|
|
if (standbyWait_us > 1000000)
|
|
standbyWait_us = 1000000;
|
|
if (standbyWait_us > MaxStandbyDelay * 1000000 / 4)
|
|
standbyWait_us = MaxStandbyDelay * 1000000 / 4;
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* This is the main executioner for any query backend that conflicts with
|
|
* recovery processing. Judgement has already been passed on it within
|
|
* a specific rmgr. Here we just issue the orders to the procs. The procs
|
|
* then throw the required error as instructed.
|
|
*/
|
|
static void
|
|
ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
|
|
ProcSignalReason reason)
|
|
{
|
|
char waitactivitymsg[100];
|
|
char oldactivitymsg[101];
|
|
|
|
while (VirtualTransactionIdIsValid(*waitlist))
|
|
{
|
|
long wait_s;
|
|
int wait_us; /* wait in microseconds (us) */
|
|
TimestampTz waitStart;
|
|
bool logged;
|
|
|
|
waitStart = GetCurrentTimestamp();
|
|
standbyWait_us = STANDBY_INITIAL_WAIT_US;
|
|
logged = false;
|
|
|
|
/* wait until the virtual xid is gone */
|
|
while (!ConditionalVirtualXactLockTableWait(*waitlist))
|
|
{
|
|
/*
|
|
* Report if we have been waiting for a while now...
|
|
*/
|
|
TimestampTz now = GetCurrentTimestamp();
|
|
|
|
TimestampDifference(waitStart, now, &wait_s, &wait_us);
|
|
if (!logged && (wait_s > 0 || wait_us > 500000))
|
|
{
|
|
const char *oldactivitymsgp;
|
|
int len;
|
|
|
|
oldactivitymsgp = get_ps_display(&len);
|
|
|
|
if (len > 100)
|
|
len = 100;
|
|
|
|
memcpy(oldactivitymsg, oldactivitymsgp, len);
|
|
oldactivitymsg[len] = 0;
|
|
|
|
snprintf(waitactivitymsg, sizeof(waitactivitymsg),
|
|
"waiting for max_standby_delay (%u s)",
|
|
MaxStandbyDelay);
|
|
set_ps_display(waitactivitymsg, false);
|
|
|
|
pgstat_report_waiting(true);
|
|
|
|
logged = true;
|
|
}
|
|
|
|
/* Is it time to kill it? */
|
|
if (WaitExceedsMaxStandbyDelay())
|
|
{
|
|
pid_t pid;
|
|
|
|
/*
|
|
* Now find out who to throw out of the balloon.
|
|
*/
|
|
Assert(VirtualTransactionIdIsValid(*waitlist));
|
|
pid = CancelVirtualTransaction(*waitlist, reason);
|
|
|
|
/*
|
|
* Wait awhile for it to die so that we avoid flooding an
|
|
* unresponsive backend when system is heavily loaded.
|
|
*/
|
|
if (pid != 0)
|
|
pg_usleep(5000);
|
|
}
|
|
}
|
|
|
|
/* Reset ps display */
|
|
if (logged)
|
|
{
|
|
set_ps_display(oldactivitymsg, false);
|
|
pgstat_report_waiting(false);
|
|
}
|
|
|
|
/* The virtual transaction is gone now, wait for the next one */
|
|
waitlist++;
|
|
}
|
|
}
|
|
|
|
void
|
|
ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node)
|
|
{
|
|
VirtualTransactionId *backends;
|
|
|
|
backends = GetConflictingVirtualXIDs(latestRemovedXid,
|
|
node.dbNode);
|
|
|
|
ResolveRecoveryConflictWithVirtualXIDs(backends,
|
|
PROCSIG_RECOVERY_CONFLICT_SNAPSHOT);
|
|
}
|
|
|
|
void
|
|
ResolveRecoveryConflictWithTablespace(Oid tsid)
|
|
{
|
|
VirtualTransactionId *temp_file_users;
|
|
|
|
/*
|
|
* Standby users may be currently using this tablespace for for their
|
|
* temporary files. We only care about current users because
|
|
* temp_tablespace parameter will just ignore tablespaces that no longer
|
|
* exist.
|
|
*
|
|
* Ask everybody to cancel their queries immediately so we can ensure no
|
|
* temp files remain and we can remove the tablespace. Nuke the entire
|
|
* site from orbit, it's the only way to be sure.
|
|
*
|
|
* XXX: We could work out the pids of active backends using this
|
|
* tablespace by examining the temp filenames in the directory. We would
|
|
* then convert the pids into VirtualXIDs before attempting to cancel
|
|
* them.
|
|
*
|
|
* We don't wait for commit because drop tablespace is non-transactional.
|
|
*/
|
|
temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
|
|
InvalidOid);
|
|
ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
|
|
PROCSIG_RECOVERY_CONFLICT_TABLESPACE);
|
|
}
|
|
|
|
void
|
|
ResolveRecoveryConflictWithDatabase(Oid dbid)
|
|
{
|
|
/*
|
|
* We don't do ResolveRecoveryConflictWithVirutalXIDs() here since that
|
|
* only waits for transactions and completely idle sessions would block
|
|
* us. This is rare enough that we do this as simply as possible: no wait,
|
|
* just force them off immediately.
|
|
*
|
|
* No locking is required here because we already acquired
|
|
* AccessExclusiveLock. Anybody trying to connect while we do this will
|
|
* block during InitPostgres() and then disconnect when they see the
|
|
* database has been removed.
|
|
*/
|
|
while (CountDBBackends(dbid) > 0)
|
|
{
|
|
CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
|
|
|
|
/*
|
|
* Wait awhile for them to die so that we avoid flooding an
|
|
* unresponsive backend when system is heavily loaded.
|
|
*/
|
|
pg_usleep(10000);
|
|
}
|
|
}
|
|
|
|
static void
|
|
ResolveRecoveryConflictWithLock(Oid dbOid, Oid relOid)
|
|
{
|
|
VirtualTransactionId *backends;
|
|
bool report_memory_error = false;
|
|
bool lock_acquired = false;
|
|
int num_attempts = 0;
|
|
LOCKTAG locktag;
|
|
|
|
SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
|
|
|
|
/*
|
|
* If blowing away everybody with conflicting locks doesn't work, after
|
|
* the first two attempts then we just start blowing everybody away until
|
|
* it does work. We do this because its likely that we either have too
|
|
* many locks and we just can't get one at all, or that there are many
|
|
* people crowding for the same table. Recovery must win; the end
|
|
* justifies the means.
|
|
*/
|
|
while (!lock_acquired)
|
|
{
|
|
if (++num_attempts < 3)
|
|
backends = GetLockConflicts(&locktag, AccessExclusiveLock);
|
|
else
|
|
{
|
|
backends = GetConflictingVirtualXIDs(InvalidTransactionId,
|
|
InvalidOid);
|
|
report_memory_error = true;
|
|
}
|
|
|
|
ResolveRecoveryConflictWithVirtualXIDs(backends,
|
|
PROCSIG_RECOVERY_CONFLICT_LOCK);
|
|
|
|
if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true, false)
|
|
!= LOCKACQUIRE_NOT_AVAIL)
|
|
lock_acquired = true;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
|
|
* to resolve conflicts with other backends holding buffer pins.
|
|
*
|
|
* We either resolve conflicts immediately or set a SIGALRM to wake us at
|
|
* the limit of our patience. The sleep in LockBufferForCleanup() is
|
|
* performed here, for code clarity.
|
|
*
|
|
* Resolve conflict by sending a SIGUSR1 reason to all backends to check if
|
|
* they hold one of the buffer pins that is blocking Startup process. If so,
|
|
* backends will take an appropriate error action, ERROR or FATAL.
|
|
*
|
|
* We also check for deadlocks before we wait, though applications that cause
|
|
* these will be extremely rare. Deadlocks occur because if queries
|
|
* wait on a lock, that must be behind an AccessExclusiveLock, which can only
|
|
* be cleared if the Startup process replays a transaction completion record.
|
|
* If Startup process is also waiting then that is a deadlock. The deadlock
|
|
* can occur if the query is waiting and then the Startup sleeps, or if
|
|
* Startup is sleeping and the the query waits on a lock. We protect against
|
|
* only the former sequence here, the latter sequence is checked prior to
|
|
* the query sleeping, in CheckRecoveryConflictDeadlock().
|
|
*/
|
|
void
|
|
ResolveRecoveryConflictWithBufferPin(void)
|
|
{
|
|
bool sig_alarm_enabled = false;
|
|
|
|
Assert(InHotStandby);
|
|
|
|
if (MaxStandbyDelay == 0)
|
|
{
|
|
/*
|
|
* We don't want to wait, so just tell everybody holding the pin to
|
|
* get out of town.
|
|
*/
|
|
SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
|
|
}
|
|
else if (MaxStandbyDelay == -1)
|
|
{
|
|
/*
|
|
* Send out a request to check for buffer pin deadlocks before we
|
|
* wait. This is fairly cheap, so no need to wait for deadlock timeout
|
|
* before trying to send it out.
|
|
*/
|
|
SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
|
|
}
|
|
else
|
|
{
|
|
TimestampTz now;
|
|
long standby_delay_secs; /* How far Startup process is lagging */
|
|
int standby_delay_usecs;
|
|
|
|
now = GetCurrentTimestamp();
|
|
|
|
/* Are we past max_standby_delay? */
|
|
TimestampDifference(GetLatestXLogTime(), now,
|
|
&standby_delay_secs, &standby_delay_usecs);
|
|
|
|
if (standby_delay_secs >= MaxStandbyDelay)
|
|
{
|
|
/*
|
|
* We're already behind, so clear a path as quickly as possible.
|
|
*/
|
|
SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
|
|
}
|
|
else
|
|
{
|
|
TimestampTz fin_time; /* Expected wake-up time by timer */
|
|
long timer_delay_secs; /* Amount of time we set timer
|
|
* for */
|
|
int timer_delay_usecs = 0;
|
|
|
|
/*
|
|
* Send out a request to check for buffer pin deadlocks before we
|
|
* wait. This is fairly cheap, so no need to wait for deadlock
|
|
* timeout before trying to send it out.
|
|
*/
|
|
SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
|
|
|
|
/*
|
|
* How much longer we should wait?
|
|
*/
|
|
timer_delay_secs = MaxStandbyDelay - standby_delay_secs;
|
|
if (standby_delay_usecs > 0)
|
|
{
|
|
timer_delay_secs -= 1;
|
|
timer_delay_usecs = 1000000 - standby_delay_usecs;
|
|
}
|
|
|
|
/*
|
|
* It's possible that the difference is less than a microsecond;
|
|
* ensure we don't cancel, rather than set, the interrupt.
|
|
*/
|
|
if (timer_delay_secs == 0 && timer_delay_usecs == 0)
|
|
timer_delay_usecs = 1;
|
|
|
|
/*
|
|
* When is the finish time? We recheck this if we are woken early.
|
|
*/
|
|
fin_time = TimestampTzPlusMilliseconds(now,
|
|
(timer_delay_secs * 1000) +
|
|
(timer_delay_usecs / 1000));
|
|
|
|
if (enable_standby_sig_alarm(timer_delay_secs, timer_delay_usecs, fin_time))
|
|
sig_alarm_enabled = true;
|
|
else
|
|
elog(FATAL, "could not set timer for process wakeup");
|
|
}
|
|
}
|
|
|
|
/* Wait to be signaled by UnpinBuffer() */
|
|
ProcWaitForSignal();
|
|
|
|
if (sig_alarm_enabled)
|
|
{
|
|
if (!disable_standby_sig_alarm())
|
|
elog(FATAL, "could not disable timer for process wakeup");
|
|
}
|
|
}
|
|
|
|
void
|
|
SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
|
|
{
|
|
Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
|
|
reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
|
|
|
|
/*
|
|
* We send signal to all backends to ask them if they are holding the
|
|
* buffer pin which is delaying the Startup process. We must not set the
|
|
* conflict flag yet, since most backends will be innocent. Let the
|
|
* SIGUSR1 handling in each backend decide their own fate.
|
|
*/
|
|
CancelDBBackends(InvalidOid, reason, false);
|
|
}
|
|
|
|
/*
|
|
* In Hot Standby perform early deadlock detection. We abort the lock
|
|
* wait if are about to sleep while holding the buffer pin that Startup
|
|
* process is waiting for. The deadlock occurs because we can only be
|
|
* waiting behind an AccessExclusiveLock, which can only clear when a
|
|
* transaction completion record is replayed, which can only occur when
|
|
* Startup process is not waiting. So if Startup process is waiting we
|
|
* never will clear that lock, so if we wait we cause deadlock. If we
|
|
* are the Startup process then no need to check for deadlocks.
|
|
*/
|
|
void
|
|
CheckRecoveryConflictDeadlock(LWLockId partitionLock)
|
|
{
|
|
Assert(!InRecovery);
|
|
|
|
if (!HoldingBufferPinThatDelaysRecovery())
|
|
return;
|
|
|
|
LWLockRelease(partitionLock);
|
|
|
|
/*
|
|
* Error message should match ProcessInterrupts() but we avoid calling
|
|
* that because we aren't handling an interrupt at this point. Note that
|
|
* we only cancel the current transaction here, so if we are in a
|
|
* subtransaction and the pin is held by a parent, then the Startup
|
|
* process will continue to wait even though we have avoided deadlock.
|
|
*/
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_QUERY_CANCELED),
|
|
errmsg("canceling statement due to conflict with recovery"),
|
|
errdetail("User transaction caused buffer deadlock with recovery.")));
|
|
}
|
|
|
|
/*
|
|
* -----------------------------------------------------
|
|
* Locking in Recovery Mode
|
|
* -----------------------------------------------------
|
|
*
|
|
* All locks are held by the Startup process using a single virtual
|
|
* transaction. This implementation is both simpler and in some senses,
|
|
* more correct. The locks held mean "some original transaction held
|
|
* this lock, so query access is not allowed at this time". So the Startup
|
|
* process is the proxy by which the original locks are implemented.
|
|
*
|
|
* We only keep track of AccessExclusiveLocks, which are only ever held by
|
|
* one transaction on one relation, and don't worry about lock queuing.
|
|
*
|
|
* We keep a single dynamically expandible list of locks in local memory,
|
|
* RelationLockList, so we can keep track of the various entried made by
|
|
* the Startup process's virtual xid in the shared lock table.
|
|
*
|
|
* List elements use type xl_rel_lock, since the WAL record type exactly
|
|
* matches the information that we need to keep track of.
|
|
*
|
|
* We use session locks rather than normal locks so we don't need
|
|
* ResourceOwners.
|
|
*/
|
|
|
|
|
|
void
|
|
StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
|
|
{
|
|
xl_standby_lock *newlock;
|
|
LOCKTAG locktag;
|
|
|
|
/* Already processed? */
|
|
if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
|
|
return;
|
|
|
|
elog(trace_recovery(DEBUG4),
|
|
"adding recovery lock: db %u rel %u", dbOid, relOid);
|
|
|
|
/* dbOid is InvalidOid when we are locking a shared relation. */
|
|
Assert(OidIsValid(relOid));
|
|
|
|
newlock = palloc(sizeof(xl_standby_lock));
|
|
newlock->xid = xid;
|
|
newlock->dbOid = dbOid;
|
|
newlock->relOid = relOid;
|
|
RecoveryLockList = lappend(RecoveryLockList, newlock);
|
|
|
|
/*
|
|
* Attempt to acquire the lock as requested, if not resolve conflict
|
|
*/
|
|
SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
|
|
|
|
if (LockAcquireExtended(&locktag, AccessExclusiveLock, true, true, false)
|
|
== LOCKACQUIRE_NOT_AVAIL)
|
|
ResolveRecoveryConflictWithLock(newlock->dbOid, newlock->relOid);
|
|
}
|
|
|
|
static void
|
|
StandbyReleaseLocks(TransactionId xid)
|
|
{
|
|
ListCell *cell,
|
|
*prev,
|
|
*next;
|
|
|
|
/*
|
|
* Release all matching locks and remove them from list
|
|
*/
|
|
prev = NULL;
|
|
for (cell = list_head(RecoveryLockList); cell; cell = next)
|
|
{
|
|
xl_standby_lock *lock = (xl_standby_lock *) lfirst(cell);
|
|
|
|
next = lnext(cell);
|
|
|
|
if (!TransactionIdIsValid(xid) || lock->xid == xid)
|
|
{
|
|
LOCKTAG locktag;
|
|
|
|
elog(trace_recovery(DEBUG4),
|
|
"releasing recovery lock: xid %u db %u rel %u",
|
|
lock->xid, lock->dbOid, lock->relOid);
|
|
SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
|
|
if (!LockRelease(&locktag, AccessExclusiveLock, true))
|
|
elog(trace_recovery(LOG),
|
|
"RecoveryLockList contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
|
|
lock->xid, lock->dbOid, lock->relOid);
|
|
|
|
RecoveryLockList = list_delete_cell(RecoveryLockList, cell, prev);
|
|
pfree(lock);
|
|
}
|
|
else
|
|
prev = cell;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Release locks for a transaction tree, starting at xid down, from
|
|
* RecoveryLockList.
|
|
*
|
|
* Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
|
|
* to remove any AccessExclusiveLocks requested by a transaction.
|
|
*/
|
|
void
|
|
StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
|
|
{
|
|
int i;
|
|
|
|
StandbyReleaseLocks(xid);
|
|
|
|
for (i = 0; i < nsubxids; i++)
|
|
StandbyReleaseLocks(subxids[i]);
|
|
}
|
|
|
|
/*
|
|
* StandbyReleaseLocksMany
|
|
* Release standby locks held by XIDs < removeXid
|
|
*
|
|
* If keepPreparedXacts is true, keep prepared transactions even if
|
|
* they're older than removeXid
|
|
*/
|
|
static void
|
|
StandbyReleaseLocksMany(TransactionId removeXid, bool keepPreparedXacts)
|
|
{
|
|
ListCell *cell,
|
|
*prev,
|
|
*next;
|
|
LOCKTAG locktag;
|
|
|
|
/*
|
|
* Release all matching locks.
|
|
*/
|
|
prev = NULL;
|
|
for (cell = list_head(RecoveryLockList); cell; cell = next)
|
|
{
|
|
xl_standby_lock *lock = (xl_standby_lock *) lfirst(cell);
|
|
|
|
next = lnext(cell);
|
|
|
|
if (!TransactionIdIsValid(removeXid) || TransactionIdPrecedes(lock->xid, removeXid))
|
|
{
|
|
if (keepPreparedXacts && StandbyTransactionIdIsPrepared(lock->xid))
|
|
continue;
|
|
elog(trace_recovery(DEBUG4),
|
|
"releasing recovery lock: xid %u db %u rel %u",
|
|
lock->xid, lock->dbOid, lock->relOid);
|
|
SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
|
|
if (!LockRelease(&locktag, AccessExclusiveLock, true))
|
|
elog(trace_recovery(LOG),
|
|
"RecoveryLockList contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
|
|
lock->xid, lock->dbOid, lock->relOid);
|
|
RecoveryLockList = list_delete_cell(RecoveryLockList, cell, prev);
|
|
pfree(lock);
|
|
}
|
|
else
|
|
prev = cell;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Called at end of recovery and when we see a shutdown checkpoint.
|
|
*/
|
|
void
|
|
StandbyReleaseAllLocks(void)
|
|
{
|
|
elog(trace_recovery(DEBUG2), "release all standby locks");
|
|
StandbyReleaseLocksMany(InvalidTransactionId, false);
|
|
}
|
|
|
|
/*
|
|
* StandbyReleaseOldLocks
|
|
* Release standby locks held by XIDs < removeXid, as long
|
|
* as their not prepared transactions.
|
|
*/
|
|
void
|
|
StandbyReleaseOldLocks(TransactionId removeXid)
|
|
{
|
|
StandbyReleaseLocksMany(removeXid, true);
|
|
}
|
|
|
|
/*
|
|
* --------------------------------------------------------------------
|
|
* Recovery handling for Rmgr RM_STANDBY_ID
|
|
*
|
|
* These record types will only be created if XLogStandbyInfoActive()
|
|
* --------------------------------------------------------------------
|
|
*/
|
|
|
|
void
|
|
standby_redo(XLogRecPtr lsn, XLogRecord *record)
|
|
{
|
|
uint8 info = record->xl_info & ~XLR_INFO_MASK;
|
|
|
|
/* Do nothing if we're not in standby mode */
|
|
if (standbyState == STANDBY_DISABLED)
|
|
return;
|
|
|
|
if (info == XLOG_STANDBY_LOCK)
|
|
{
|
|
xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
|
|
int i;
|
|
|
|
for (i = 0; i < xlrec->nlocks; i++)
|
|
StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
|
|
xlrec->locks[i].dbOid,
|
|
xlrec->locks[i].relOid);
|
|
}
|
|
else if (info == XLOG_RUNNING_XACTS)
|
|
{
|
|
xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
|
|
RunningTransactionsData running;
|
|
|
|
running.xcnt = xlrec->xcnt;
|
|
running.subxid_overflow = xlrec->subxid_overflow;
|
|
running.nextXid = xlrec->nextXid;
|
|
running.oldestRunningXid = xlrec->oldestRunningXid;
|
|
running.xids = xlrec->xids;
|
|
|
|
ProcArrayApplyRecoveryInfo(&running);
|
|
}
|
|
else
|
|
elog(PANIC, "relation_redo: unknown op code %u", info);
|
|
}
|
|
|
|
static void
|
|
standby_desc_running_xacts(StringInfo buf, xl_running_xacts *xlrec)
|
|
{
|
|
int i;
|
|
|
|
appendStringInfo(buf, " nextXid %u oldestRunningXid %u",
|
|
xlrec->nextXid,
|
|
xlrec->oldestRunningXid);
|
|
if (xlrec->xcnt > 0)
|
|
{
|
|
appendStringInfo(buf, "; %d xacts:", xlrec->xcnt);
|
|
for (i = 0; i < xlrec->xcnt; i++)
|
|
appendStringInfo(buf, " %u", xlrec->xids[i]);
|
|
}
|
|
|
|
if (xlrec->subxid_overflow)
|
|
appendStringInfo(buf, "; subxid ovf");
|
|
}
|
|
|
|
void
|
|
standby_desc(StringInfo buf, uint8 xl_info, char *rec)
|
|
{
|
|
uint8 info = xl_info & ~XLR_INFO_MASK;
|
|
|
|
if (info == XLOG_STANDBY_LOCK)
|
|
{
|
|
xl_standby_locks *xlrec = (xl_standby_locks *) rec;
|
|
int i;
|
|
|
|
appendStringInfo(buf, "AccessExclusive locks:");
|
|
|
|
for (i = 0; i < xlrec->nlocks; i++)
|
|
appendStringInfo(buf, " xid %u db %u rel %u",
|
|
xlrec->locks[i].xid, xlrec->locks[i].dbOid,
|
|
xlrec->locks[i].relOid);
|
|
}
|
|
else if (info == XLOG_RUNNING_XACTS)
|
|
{
|
|
xl_running_xacts *xlrec = (xl_running_xacts *) rec;
|
|
|
|
appendStringInfo(buf, " running xacts:");
|
|
standby_desc_running_xacts(buf, xlrec);
|
|
}
|
|
else
|
|
appendStringInfo(buf, "UNKNOWN");
|
|
}
|
|
|
|
/*
|
|
* Log details of the current snapshot to WAL. This allows the snapshot state
|
|
* to be reconstructed on the standby.
|
|
*/
|
|
void
|
|
LogStandbySnapshot(TransactionId *oldestActiveXid, TransactionId *nextXid)
|
|
{
|
|
RunningTransactions running;
|
|
xl_standby_lock *locks;
|
|
int nlocks;
|
|
|
|
Assert(XLogStandbyInfoActive());
|
|
|
|
/*
|
|
* Get details of any AccessExclusiveLocks being held at the moment.
|
|
*/
|
|
locks = GetRunningTransactionLocks(&nlocks);
|
|
if (nlocks > 0)
|
|
LogAccessExclusiveLocks(nlocks, locks);
|
|
|
|
/*
|
|
* Log details of all in-progress transactions. This should be the last
|
|
* record we write, because standby will open up when it sees this.
|
|
*/
|
|
running = GetRunningTransactionData();
|
|
LogCurrentRunningXacts(running);
|
|
|
|
*oldestActiveXid = running->oldestRunningXid;
|
|
*nextXid = running->nextXid;
|
|
}
|
|
|
|
/*
|
|
* Record an enhanced snapshot of running transactions into WAL.
|
|
*
|
|
* The definitions of RunningTransactionData and xl_xact_running_xacts
|
|
* are similar. We keep them separate because xl_xact_running_xacts
|
|
* is a contiguous chunk of memory and never exists fully until it is
|
|
* assembled in WAL.
|
|
*/
|
|
static void
|
|
LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
|
|
{
|
|
xl_running_xacts xlrec;
|
|
XLogRecData rdata[2];
|
|
int lastrdata = 0;
|
|
XLogRecPtr recptr;
|
|
|
|
xlrec.xcnt = CurrRunningXacts->xcnt;
|
|
xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
|
|
xlrec.nextXid = CurrRunningXacts->nextXid;
|
|
xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
|
|
|
|
/* Header */
|
|
rdata[0].data = (char *) (&xlrec);
|
|
rdata[0].len = MinSizeOfXactRunningXacts;
|
|
rdata[0].buffer = InvalidBuffer;
|
|
|
|
/* array of TransactionIds */
|
|
if (xlrec.xcnt > 0)
|
|
{
|
|
rdata[0].next = &(rdata[1]);
|
|
rdata[1].data = (char *) CurrRunningXacts->xids;
|
|
rdata[1].len = xlrec.xcnt * sizeof(TransactionId);
|
|
rdata[1].buffer = InvalidBuffer;
|
|
lastrdata = 1;
|
|
}
|
|
|
|
rdata[lastrdata].next = NULL;
|
|
|
|
recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS, rdata);
|
|
|
|
if (CurrRunningXacts->subxid_overflow)
|
|
elog(trace_recovery(DEBUG2),
|
|
"snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u next xid %u)",
|
|
CurrRunningXacts->xcnt,
|
|
recptr.xlogid, recptr.xrecoff,
|
|
CurrRunningXacts->oldestRunningXid,
|
|
CurrRunningXacts->nextXid);
|
|
else
|
|
elog(trace_recovery(DEBUG2),
|
|
"snapshot of %u running transaction ids (lsn %X/%X oldest xid %u next xid %u)",
|
|
CurrRunningXacts->xcnt,
|
|
recptr.xlogid, recptr.xrecoff,
|
|
CurrRunningXacts->oldestRunningXid,
|
|
CurrRunningXacts->nextXid);
|
|
|
|
}
|
|
|
|
/*
|
|
* Wholesale logging of AccessExclusiveLocks. Other lock types need not be
|
|
* logged, as described in backend/storage/lmgr/README.
|
|
*/
|
|
static void
|
|
LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
|
|
{
|
|
XLogRecData rdata[2];
|
|
xl_standby_locks xlrec;
|
|
|
|
xlrec.nlocks = nlocks;
|
|
|
|
rdata[0].data = (char *) &xlrec;
|
|
rdata[0].len = offsetof(xl_standby_locks, locks);
|
|
rdata[0].buffer = InvalidBuffer;
|
|
rdata[0].next = &rdata[1];
|
|
|
|
rdata[1].data = (char *) locks;
|
|
rdata[1].len = nlocks * sizeof(xl_standby_lock);
|
|
rdata[1].buffer = InvalidBuffer;
|
|
rdata[1].next = NULL;
|
|
|
|
(void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK, rdata);
|
|
}
|
|
|
|
/*
|
|
* Individual logging of AccessExclusiveLocks for use during LockAcquire()
|
|
*/
|
|
void
|
|
LogAccessExclusiveLock(Oid dbOid, Oid relOid)
|
|
{
|
|
xl_standby_lock xlrec;
|
|
|
|
/*
|
|
* Ensure that a TransactionId has been assigned to this transaction. We
|
|
* don't actually need the xid yet but if we don't do this then
|
|
* RecordTransactionCommit() and RecordTransactionAbort() will optimise
|
|
* away the transaction completion record which recovery relies upon to
|
|
* release locks. It's a hack, but for a corner case not worth adding code
|
|
* for into the main commit path.
|
|
*/
|
|
xlrec.xid = GetTopTransactionId();
|
|
|
|
/*
|
|
* Decode the locktag back to the original values, to avoid sending lots
|
|
* of empty bytes with every message. See lock.h to check how a locktag
|
|
* is defined for LOCKTAG_RELATION
|
|
*/
|
|
xlrec.dbOid = dbOid;
|
|
xlrec.relOid = relOid;
|
|
|
|
LogAccessExclusiveLocks(1, &xlrec);
|
|
}
|