Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read-only queries. Recovery must reach a consistent state before connections are allowed, so there is a delay, typically short, before connections succeed.

Replay of recovering transactions can conflict with, and in some cases deadlock against, queries during recovery; these result in query cancellation after max_standby_delay seconds have expired.

Infrastructure changes have minor effects on normal running, though they introduce four new types of WAL record.

New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port-specific behaviours have been utilised, though primary testing has been on Linux only so far.

This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.

Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii and Hiroyuki Yamada, plus support and feedback from many other community members.
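As a minimal sketch of the setup the message describes (the parameter names recovery_connections and max_standby_delay come from the message itself; the values and the archive path are illustrative assumptions only, and placement of settings may differ in this development cycle):

    # postgresql.conf on the standby
    recovery_connections = on    # default; allow read-only connections during recovery
    max_standby_delay = 30       # seconds before conflicting queries are cancelled

    # recovery.conf in the standby's data directory, forcing archive recovery
    restore_command = 'cp /mnt/server/archivedir/%f %p'   # illustrative path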
src/backend/access/heap/heapam.c

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.278 2009/08/24 02:18:31 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.279 2009/12/19 01:32:32 sriggs Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -59,6 +59,7 @@
 #include "storage/lmgr.h"
 #include "storage/procarray.h"
 #include "storage/smgr.h"
+#include "storage/standby.h"
 #include "utils/datum.h"
 #include "utils/inval.h"
 #include "utils/lsyscache.h"
@@ -248,8 +249,11 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
     /*
      * If the all-visible flag indicates that all tuples on the page are
      * visible to everyone, we can skip the per-tuple visibility tests.
+     * But not in hot standby mode. A tuple that's already visible to all
+     * transactions in the master might still be invisible to a read-only
+     * transaction in the standby.
      */
-    all_visible = PageIsAllVisible(dp);
+    all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;

     for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
          lineoff <= lines;
@@ -3769,6 +3773,60 @@ heap_restrpos(HeapScanDesc scan)
     }
 }

+/*
+ * If 'tuple' contains any XID greater than latestRemovedXid, update
+ * latestRemovedXid to the greatest one found.
+ */
+void
+HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
+                                       TransactionId *latestRemovedXid)
+{
+    TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
+    TransactionId xmax = HeapTupleHeaderGetXmax(tuple);
+    TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
+
+    if (tuple->t_infomask & HEAP_MOVED_OFF ||
+        tuple->t_infomask & HEAP_MOVED_IN)
+    {
+        if (TransactionIdPrecedes(*latestRemovedXid, xvac))
+            *latestRemovedXid = xvac;
+    }
+
+    if (TransactionIdPrecedes(*latestRemovedXid, xmax))
+        *latestRemovedXid = xmax;
+
+    if (TransactionIdPrecedes(*latestRemovedXid, xmin))
+        *latestRemovedXid = xmin;
+
+    Assert(TransactionIdIsValid(*latestRemovedXid));
+}
+
+/*
+ * Perform XLogInsert to register a heap cleanup info message. These
+ * messages are sent once per VACUUM and are required because
+ * of the phasing of removal operations during a lazy VACUUM.
+ * see comments for vacuum_log_cleanup_info().
+ */
+XLogRecPtr
+log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
+{
+    xl_heap_cleanup_info xlrec;
+    XLogRecPtr  recptr;
+    XLogRecData rdata;
+
+    xlrec.node = rnode;
+    xlrec.latestRemovedXid = latestRemovedXid;
+
+    rdata.data = (char *) &xlrec;
+    rdata.len = SizeOfHeapCleanupInfo;
+    rdata.buffer = InvalidBuffer;
+    rdata.next = NULL;
+
+    recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO, &rdata);
+
+    return recptr;
+}
+
 /*
  * Perform XLogInsert for a heap-clean operation. Caller must already
  * have modified the buffer and marked it dirty.
@@ -3776,13 +3834,17 @@ heap_restrpos(HeapScanDesc scan)
  * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
  * zero-based tuple indexes. Now they are one-based like other uses
  * of OffsetNumber.
+ *
+ * We also include latestRemovedXid, which is the greatest XID present in
+ * the removed tuples. That allows recovery processing to cancel or wait
+ * for long standby queries that can still see these tuples.
  */
 XLogRecPtr
 log_heap_clean(Relation reln, Buffer buffer,
                OffsetNumber *redirected, int nredirected,
                OffsetNumber *nowdead, int ndead,
                OffsetNumber *nowunused, int nunused,
-               bool redirect_move)
+               TransactionId latestRemovedXid, bool redirect_move)
 {
     xl_heap_clean xlrec;
     uint8       info;
@@ -3794,6 +3856,7 @@ log_heap_clean(Relation reln, Buffer buffer,

     xlrec.node = reln->rd_node;
     xlrec.block = BufferGetBlockNumber(buffer);
+    xlrec.latestRemovedXid = latestRemovedXid;
     xlrec.nredirected = nredirected;
     xlrec.ndead = ndead;

@@ -4067,6 +4130,33 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
     return recptr;
 }

+/*
+ * Handles CLEANUP_INFO
+ */
+static void
+heap_xlog_cleanup_info(XLogRecPtr lsn, XLogRecord *record)
+{
+    xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
+
+    if (InHotStandby)
+    {
+        VirtualTransactionId *backends;
+
+        backends = GetConflictingVirtualXIDs(xlrec->latestRemovedXid,
+                                             InvalidOid,
+                                             true);
+        ResolveRecoveryConflictWithVirtualXIDs(backends,
+                                               "VACUUM index cleanup",
+                                               CONFLICT_MODE_ERROR);
+    }
+
+    /*
+     * Actual operation is a no-op. Record type exists to provide a means
+     * for conflict processing to occur before we begin index vacuum actions.
+     * see vacuumlazy.c and also comments in btvacuumpage()
+     */
+}
+
 /*
  * Handles CLEAN and CLEAN_MOVE record types
  */
@@ -4085,12 +4175,31 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move)
     int         nunused;
     Size        freespace;

+    /*
+     * We're about to remove tuples. In Hot Standby mode, ensure that there's
+     * no queries running for which the removed tuples are still visible.
+     */
+    if (InHotStandby)
+    {
+        VirtualTransactionId *backends;
+
+        backends = GetConflictingVirtualXIDs(xlrec->latestRemovedXid,
+                                             InvalidOid,
+                                             true);
+        ResolveRecoveryConflictWithVirtualXIDs(backends,
+                                               "VACUUM heap cleanup",
+                                               CONFLICT_MODE_ERROR);
+    }
+
+    RestoreBkpBlocks(lsn, record, true);
+
     if (record->xl_info & XLR_BKP_BLOCK_1)
         return;

-    buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+    buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
     if (!BufferIsValid(buffer))
         return;
+    LockBufferForCleanup(buffer);
     page = (Page) BufferGetPage(buffer);

     if (XLByteLE(lsn, PageGetLSN(page)))
@@ -4145,12 +4254,40 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
     Buffer      buffer;
     Page        page;

+    /*
+     * In Hot Standby mode, ensure that there's no queries running which still
+     * consider the frozen xids as running.
+     */
+    if (InHotStandby)
+    {
+        VirtualTransactionId *backends;
+
+        /*
+         * XXX: Using cutoff_xid is overly conservative. Even if cutoff_xid
+         * is recent enough to conflict with a backend, the actual values
+         * being frozen might not be. With a typical vacuum_freeze_min_age
+         * setting in the ballpark of millions of transactions, it won't make
+         * a difference, but it might if you run a manual VACUUM FREEZE.
+         * Typically the cutoff is much earlier than any recently deceased
+         * tuple versions removed by this vacuum, so don't worry too much.
+         */
+        backends = GetConflictingVirtualXIDs(cutoff_xid,
+                                             InvalidOid,
+                                             true);
+        ResolveRecoveryConflictWithVirtualXIDs(backends,
+                                               "VACUUM heap freeze",
+                                               CONFLICT_MODE_ERROR);
+    }
+
+    RestoreBkpBlocks(lsn, record, false);
+
     if (record->xl_info & XLR_BKP_BLOCK_1)
         return;

-    buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+    buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
     if (!BufferIsValid(buffer))
         return;
+    LockBufferForCleanup(buffer);
     page = (Page) BufferGetPage(buffer);

     if (XLByteLE(lsn, PageGetLSN(page)))
@@ -4740,6 +4877,11 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record)
 {
     uint8       info = record->xl_info & ~XLR_INFO_MASK;

+    /*
+     * These operations don't overwrite MVCC data so no conflict
+     * processing is required. The ones in heap2 rmgr do.
+     */
+
     RestoreBkpBlocks(lsn, record, false);

     switch (info & XLOG_HEAP_OPMASK)
@@ -4778,20 +4920,25 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
 {
     uint8       info = record->xl_info & ~XLR_INFO_MASK;

+    /*
+     * Note that RestoreBkpBlocks() is called after conflict processing
+     * within each record type handling function.
+     */
+
     switch (info & XLOG_HEAP_OPMASK)
     {
         case XLOG_HEAP2_FREEZE:
-            RestoreBkpBlocks(lsn, record, false);
             heap_xlog_freeze(lsn, record);
             break;
         case XLOG_HEAP2_CLEAN:
-            RestoreBkpBlocks(lsn, record, true);
             heap_xlog_clean(lsn, record, false);
             break;
         case XLOG_HEAP2_CLEAN_MOVE:
-            RestoreBkpBlocks(lsn, record, true);
             heap_xlog_clean(lsn, record, true);
             break;
+        case XLOG_HEAP2_CLEANUP_INFO:
+            heap_xlog_cleanup_info(lsn, record);
+            break;
         default:
             elog(PANIC, "heap2_redo: unknown op code %u", info);
     }
@@ -4921,17 +5068,26 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
     {
         xl_heap_clean *xlrec = (xl_heap_clean *) rec;

-        appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u",
+        appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u",
                          xlrec->node.spcNode, xlrec->node.dbNode,
-                         xlrec->node.relNode, xlrec->block);
+                         xlrec->node.relNode, xlrec->block,
+                         xlrec->latestRemovedXid);
     }
     else if (info == XLOG_HEAP2_CLEAN_MOVE)
     {
         xl_heap_clean *xlrec = (xl_heap_clean *) rec;

-        appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u",
+        appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u remxid %u",
                          xlrec->node.spcNode, xlrec->node.dbNode,
-                         xlrec->node.relNode, xlrec->block);
+                         xlrec->node.relNode, xlrec->block,
+                         xlrec->latestRemovedXid);
     }
+    else if (info == XLOG_HEAP2_CLEANUP_INFO)
+    {
+        xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec;
+
+        appendStringInfo(buf, "cleanup info: remxid %u",
+                         xlrec->latestRemovedXid);
+    }
     else
         appendStringInfo(buf, "UNKNOWN");

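The latestRemovedXid bookkeeping added above relies on PostgreSQL's modulo-2^32 XID ordering. The following standalone sketch is hypothetical code, not part of the patch: xid_precedes mirrors the int32-subtraction comparison that TransactionIdPrecedes applies to ordinary XIDs, and advance_latest_removed models HeapTupleHeaderAdvanceLatestRemovedXid minus the HEAP_MOVED/xvac case.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t TransactionId;

    /* Circular "is id1 older than id2?" test for ordinary XIDs. */
    static int
    xid_precedes(TransactionId id1, TransactionId id2)
    {
        return (int32_t) (id1 - id2) < 0;
    }

    /* Fold one removed tuple's xmin/xmax into the running maximum. */
    static void
    advance_latest_removed(TransactionId xmin, TransactionId xmax,
                           TransactionId *latestRemovedXid)
    {
        if (xid_precedes(*latestRemovedXid, xmax))
            *latestRemovedXid = xmax;
        if (xid_precedes(*latestRemovedXid, xmin))
            *latestRemovedXid = xmin;
    }

    int
    main(void)
    {
        /* Hypothetical (xmin, xmax) pairs from three removed tuple versions. */
        TransactionId tuples[][2] = {{100, 205}, {180, 310}, {150, 260}};
        TransactionId latest = 3;       /* any valid starting XID */

        for (int i = 0; i < 3; i++)
            advance_latest_removed(tuples[i][0], tuples[i][1], &latest);

        printf("latestRemovedXid = %u\n", latest);   /* prints 310 */
        return 0;
    }

A single XID per cleanup record is thus enough for the standby to decide which running queries conflict with the removal.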
src/backend/access/heap/pruneheap.c

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.18 2009/06/11 14:48:53 momjian Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.19 2009/12/19 01:32:32 sriggs Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -30,7 +30,8 @@
 typedef struct
 {
     TransactionId new_prune_xid;    /* new prune hint value for page */
-    int         nredirected;        /* numbers of entries in arrays below */
+    TransactionId latestRemovedXid; /* latest xid to be removed by this prune */
+    int         nredirected;        /* numbers of entries in arrays below */
     int         ndead;
     int         nunused;
     /* arrays that accumulate indexes of items to be changed */
@@ -84,6 +85,14 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
     if (!PageIsPrunable(page, OldestXmin))
         return;

+    /*
+     * We can't write WAL in recovery mode, so there's no point trying to
+     * clean the page. The master will likely issue a cleaning WAL record
+     * soon anyway, so this is no particular loss.
+     */
+    if (RecoveryInProgress())
+        return;
+
     /*
      * We prune when a previous UPDATE failed to find enough space on the page
      * for a new tuple version, or when free space falls below the relation's
@@ -176,6 +185,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
      * of our working state.
      */
     prstate.new_prune_xid = InvalidTransactionId;
+    prstate.latestRemovedXid = InvalidTransactionId;
     prstate.nredirected = prstate.ndead = prstate.nunused = 0;
     memset(prstate.marked, 0, sizeof(prstate.marked));

@@ -257,7 +267,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
                                 prstate.redirected, prstate.nredirected,
                                 prstate.nowdead, prstate.ndead,
                                 prstate.nowunused, prstate.nunused,
-                                redirect_move);
+                                prstate.latestRemovedXid, redirect_move);

         PageSetLSN(BufferGetPage(buffer), recptr);
         PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
@@ -395,6 +405,8 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
             == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
         {
             heap_prune_record_unused(prstate, rootoffnum);
+            HeapTupleHeaderAdvanceLatestRemovedXid(htup,
+                                                   &prstate->latestRemovedXid);
             ndeleted++;
         }

@@ -520,7 +532,11 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
          * find another DEAD tuple is a fairly unusual corner case.)
          */
         if (tupdead)
+        {
             latestdead = offnum;
+            HeapTupleHeaderAdvanceLatestRemovedXid(htup,
+                                                   &prstate->latestRemovedXid);
+        }
         else if (!recent_dead)
             break;

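On the replay side, heap_xlog_clean() and heap_xlog_freeze() above turn the recorded XID into a conflict decision. A standalone sketch of that decision rule follows; it is hypothetical code rather than the patch's API, written on the assumption that GetConflictingVirtualXIDs treats a backend as conflicting when its snapshot xmin precedes or equals the supplied cutoff XID.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t TransactionId;

    /* Circular "id1 <= id2" test for ordinary XIDs, per PostgreSQL's
     * modulo-2^32 transaction ID arithmetic. */
    static int
    xid_precedes_or_equals(TransactionId id1, TransactionId id2)
    {
        return (int32_t) (id1 - id2) <= 0;
    }

    /* A standby backend conflicts with tuple removal when its snapshot
     * xmin is not newer than the newest removed XID: some removed tuple
     * version might still be visible to it. */
    static int
    conflicts_with_cleanup(TransactionId snapshot_xmin,
                           TransactionId latestRemovedXid)
    {
        return xid_precedes_or_equals(snapshot_xmin, latestRemovedXid);
    }

    int
    main(void)
    {
        TransactionId latestRemovedXid = 310;   /* from the WAL record */

        /* Hypothetical snapshot xmins of two standby backends. */
        printf("xmin 300 conflicts: %d\n",
               conflicts_with_cleanup(300, latestRemovedXid));  /* 1 */
        printf("xmin 311 conflicts: %d\n",
               conflicts_with_cleanup(311, latestRemovedXid));  /* 0 */
        return 0;
    }

Per the commit message, conflicting queries are cancelled (CONFLICT_MODE_ERROR here) once max_standby_delay seconds have expired.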