1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-24 09:27:52 +03:00

Allow read only connections during recovery, known as Hot Standby.

Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter a consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though they introduce four new types of WAL record.

New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.

This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.

Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.

Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
This commit is contained in:
Simon Riggs
2009-12-19 01:32:45 +00:00
parent 78a09145e0
commit efc16ea520
87 changed files with 6165 additions and 428 deletions

View File

@@ -13,7 +13,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.228 2009/11/12 02:46:16 tgl Exp $
* $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.229 2009/12/19 01:32:34 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -26,6 +26,7 @@
#include "access/genam.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "access/xact.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
@@ -48,6 +49,7 @@
#include "storage/ipc.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
@@ -1941,6 +1943,26 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record)
dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
if (InHotStandby)
{
VirtualTransactionId *database_users;
/*
* Find all users connected to this database and ask them
* politely to immediately kill their sessions before processing
* the drop database record, after the usual grace period.
* We don't wait for commit because drop database is
* non-transactional.
*/
database_users = GetConflictingVirtualXIDs(InvalidTransactionId,
xlrec->db_id,
false);
ResolveRecoveryConflictWithVirtualXIDs(database_users,
"drop database",
CONFLICT_MODE_FATAL);
}
/* Drop pages for this database that are in the shared buffer cache */
DropDatabaseBuffers(xlrec->db_id);

View File

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/lockcmds.c,v 1.25 2009/06/11 14:48:56 momjian Exp $
* $PostgreSQL: pgsql/src/backend/commands/lockcmds.c,v 1.26 2009/12/19 01:32:34 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -47,6 +47,16 @@ LockTableCommand(LockStmt *lockstmt)
reloid = RangeVarGetRelid(relation, false);
/*
* During recovery we only accept these variations:
* LOCK TABLE foo IN ACCESS SHARE MODE
* LOCK TABLE foo IN ROW SHARE MODE
* LOCK TABLE foo IN ROW EXCLUSIVE MODE
* This test must match the restrictions defined in LockAcquire()
*/
if (lockstmt->mode > RowExclusiveLock)
PreventCommandDuringRecovery();
LockTableRecurse(reloid, relation,
lockstmt->mode, lockstmt->nowait, recurse);
}

View File

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/sequence.c,v 1.162 2009/10/13 00:53:07 tgl Exp $
* $PostgreSQL: pgsql/src/backend/commands/sequence.c,v 1.163 2009/12/19 01:32:34 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -458,6 +458,9 @@ nextval_internal(Oid relid)
rescnt = 0;
bool logit = false;
/* nextval() writes to database and must be prevented during recovery */
PreventCommandDuringRecovery();
/* open and AccessShareLock sequence */
init_sequence(relid, &elm, &seqrel);

View File

@@ -37,7 +37,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.63 2009/11/10 18:53:38 tgl Exp $
* $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.64 2009/12/19 01:32:34 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -50,6 +50,7 @@
#include "access/heapam.h"
#include "access/sysattr.h"
#include "access/transam.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "catalog/dependency.h"
@@ -60,6 +61,8 @@
#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/procarray.h"
#include "storage/standby.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
@@ -1317,11 +1320,58 @@ tblspc_redo(XLogRecPtr lsn, XLogRecord *record)
{
xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
/*
* If we issued a WAL record for a drop tablespace it is
* because there were no files in it at all. That means that
* no permanent objects can exist in it at this point.
*
* It is possible for standby users to be using this tablespace
* as a location for their temporary files, so if we fail to
* remove all files then do conflict processing and try again,
* if currently enabled.
*/
if (!remove_tablespace_directories(xlrec->ts_id, true))
ereport(ERROR,
{
VirtualTransactionId *temp_file_users;
/*
* Standby users may be currently using this tablespace for
* their temporary files. We only care about current
* users because the temp_tablespaces parameter will just ignore
* tablespaces that no longer exist.
*
* Ask everybody to cancel their queries immediately so
* we can ensure no temp files remain and we can remove the
* tablespace. Nuke the entire site from orbit, it's the only
* way to be sure.
*
* XXX: We could work out the pids of active backends
* using this tablespace by examining the temp filenames in the
* directory. We would then convert the pids into VirtualXIDs
* before attempting to cancel them.
*
* We don't wait for commit because drop tablespace is
* non-transactional.
*/
temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
InvalidOid,
false);
ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
"drop tablespace",
CONFLICT_MODE_ERROR);
/*
* If we did recovery processing then hopefully the
* backends who wrote temp files should have cleaned up and
* exited by now. So let's recheck before we throw an error.
* If !process_conflicts then this will just fail again.
*/
if (!remove_tablespace_directories(xlrec->ts_id, true))
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("tablespace %u is not empty",
xlrec->ts_id)));
}
}
else
elog(PANIC, "tblspc_redo: unknown op code %u", info);

View File

@@ -13,7 +13,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.398 2009/12/09 21:57:51 tgl Exp $
* $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.399 2009/12/19 01:32:34 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -141,6 +141,7 @@ typedef struct VRelStats
/* vtlinks array for tuple chain following - sorted by new_tid */
int num_vtlinks;
VTupleLink vtlinks;
TransactionId latestRemovedXid;
} VRelStats;
/*----------------------------------------------------------------------
@@ -224,7 +225,7 @@ static void scan_heap(VRelStats *vacrelstats, Relation onerel,
static bool repair_frag(VRelStats *vacrelstats, Relation onerel,
VacPageList vacuum_pages, VacPageList fraged_pages,
int nindexes, Relation *Irel);
static void move_chain_tuple(Relation rel,
static void move_chain_tuple(VRelStats *vacrelstats, Relation rel,
Buffer old_buf, Page old_page, HeapTuple old_tup,
Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
ExecContext ec, ItemPointer ctid, bool cleanVpd);
@@ -237,7 +238,7 @@ static void update_hint_bits(Relation rel, VacPageList fraged_pages,
int num_moved);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
VacPageList vacpagelist);
static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_page(VRelStats *vacrelstats, Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
@@ -1300,6 +1301,7 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
vacrelstats->rel_tuples = 0;
vacrelstats->rel_indexed_tuples = 0;
vacrelstats->hasindex = false;
vacrelstats->latestRemovedXid = InvalidTransactionId;
/* scan the heap */
vacuum_pages.num_pages = fraged_pages.num_pages = 0;
@@ -1708,6 +1710,9 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
{
ItemId lpp;
HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
&vacrelstats->latestRemovedXid);
/*
* Here we are building a temporary copy of the page with dead
* tuples removed. Below we will apply
@@ -2025,7 +2030,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
/* there are dead tuples on this page - clean them */
Assert(!isempty);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
vacuum_page(onerel, buf, last_vacuum_page);
vacuum_page(vacrelstats, onerel, buf, last_vacuum_page);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
else
@@ -2514,7 +2519,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
move_chain_tuple(onerel, Cbuf, Cpage, &tuple,
move_chain_tuple(vacrelstats, onerel, Cbuf, Cpage, &tuple,
dst_buffer, dst_page, destvacpage,
&ec, &Ctid, vtmove[ti].cleanVpd);
@@ -2600,7 +2605,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
dst_page = BufferGetPage(dst_buffer);
/* if this page was not used before - clean it */
if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0)
vacuum_page(onerel, dst_buffer, dst_vacpage);
vacuum_page(vacrelstats, onerel, dst_buffer, dst_vacpage);
}
else
LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
@@ -2753,7 +2758,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
HOLD_INTERRUPTS();
heldoff = true;
ForceSyncCommit();
(void) RecordTransactionCommit();
(void) RecordTransactionCommit(true);
}
/*
@@ -2781,7 +2786,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(buf);
if (!PageIsEmpty(page))
vacuum_page(onerel, buf, *curpage);
vacuum_page(vacrelstats, onerel, buf, *curpage);
UnlockReleaseBuffer(buf);
}
}
@@ -2917,7 +2922,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
recptr = log_heap_clean(onerel, buf,
NULL, 0, NULL, 0,
unused, uncnt,
false);
vacrelstats->latestRemovedXid, false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
@@ -2969,7 +2974,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
* already too long and almost unreadable.
*/
static void
move_chain_tuple(Relation rel,
move_chain_tuple(VRelStats *vacrelstats, Relation rel,
Buffer old_buf, Page old_page, HeapTuple old_tup,
Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
ExecContext ec, ItemPointer ctid, bool cleanVpd)
@@ -3027,7 +3032,7 @@ move_chain_tuple(Relation rel,
int sv_offsets_used = dst_vacpage->offsets_used;
dst_vacpage->offsets_used = 0;
vacuum_page(rel, dst_buf, dst_vacpage);
vacuum_page(vacrelstats, rel, dst_buf, dst_vacpage);
dst_vacpage->offsets_used = sv_offsets_used;
}
@@ -3367,7 +3372,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
buf = ReadBufferExtended(onerel, MAIN_FORKNUM, (*vacpage)->blkno,
RBM_NORMAL, vac_strategy);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
vacuum_page(onerel, buf, *vacpage);
vacuum_page(vacrelstats, onerel, buf, *vacpage);
UnlockReleaseBuffer(buf);
}
}
@@ -3397,7 +3402,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
* Caller must hold pin and lock on buffer.
*/
static void
vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
vacuum_page(VRelStats *vacrelstats, Relation onerel, Buffer buffer, VacPage vacpage)
{
Page page = BufferGetPage(buffer);
int i;
@@ -3426,7 +3431,7 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
recptr = log_heap_clean(onerel, buffer,
NULL, 0, NULL, 0,
vacpage->offsets, vacpage->offsets_free,
false);
vacrelstats->latestRemovedXid, false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}

View File

@@ -29,7 +29,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.124 2009/11/16 21:32:06 tgl Exp $
* $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.125 2009/12/19 01:32:34 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@@ -98,6 +98,7 @@ typedef struct LVRelStats
int max_dead_tuples; /* # slots allocated in array */
ItemPointer dead_tuples; /* array of ItemPointerData */
int num_index_scans;
TransactionId latestRemovedXid;
} LVRelStats;
@@ -265,6 +266,34 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
return heldoff;
}
/*
* For Hot Standby we need to know the highest transaction id that will
* be removed by any change. VACUUM proceeds in a number of passes so
* we need to consider how each pass operates. The first phase runs
* heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
* progresses - these will have a latestRemovedXid on each record.
* In some cases this removes all of the tuples to be removed, though
* often we have dead tuples with index pointers so we must remember them
* for removal in phase 3. Index records for those rows are removed
* in phase 2 and index blocks do not have MVCC information attached.
* So before we can allow removal of any index tuples we need to issue
* a WAL record containing the latestRemovedXid of rows that will be
* removed in phase three. This allows recovery queries to block at the
* correct place, i.e. before phase two, rather than during phase three
* which would be after the rows have become inaccessible.
*/
static void
vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
{
	/*
	 * Emit the cleanup-info WAL record only when it can matter:
	 *
	 * - temp tables are skipped, because they do not contain data that is
	 *   visible on the standby server;
	 * - nothing is logged while XLogArchivingActive() reports false
	 *   (NOTE(review): presumably because no standby can be replaying our
	 *   WAL in that case -- confirm).
	 *
	 * The record carries the relation's node and the latestRemovedXid
	 * accumulated so far in vacrelstats.  The value returned by
	 * log_heap_cleanup_info() is deliberately ignored.
	 */
	if (!rel->rd_istemp && XLogArchivingActive())
		(void) log_heap_cleanup_info(rel->rd_node,
									 vacrelstats->latestRemovedXid);
}
/*
* lazy_scan_heap() -- scan an open heap relation
@@ -315,6 +344,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
nblocks = RelationGetNumberOfBlocks(onerel);
vacrelstats->rel_pages = nblocks;
vacrelstats->nonempty_pages = 0;
vacrelstats->latestRemovedXid = InvalidTransactionId;
lazy_space_alloc(vacrelstats, nblocks);
@@ -373,6 +403,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
vacrelstats->num_dead_tuples > 0)
{
/* Log cleanup info before we touch indexes */
vacuum_log_cleanup_info(onerel, vacrelstats);
/* Remove index entries */
for (i = 0; i < nindexes; i++)
lazy_vacuum_index(Irel[i],
@@ -382,6 +415,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
lazy_vacuum_heap(onerel, vacrelstats);
/* Forget the now-vacuumed tuples, and press on */
vacrelstats->num_dead_tuples = 0;
vacrelstats->latestRemovedXid = InvalidTransactionId;
vacrelstats->num_index_scans++;
}
@@ -613,6 +647,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
if (tupgone)
{
lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
&vacrelstats->latestRemovedXid);
tups_vacuumed += 1;
}
else
@@ -661,6 +697,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
/* Forget the now-vacuumed tuples, and press on */
vacrelstats->num_dead_tuples = 0;
vacrelstats->latestRemovedXid = InvalidTransactionId;
vacuumed_pages++;
}
@@ -724,6 +761,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
/* XXX put a threshold on min number of tuples here? */
if (vacrelstats->num_dead_tuples > 0)
{
/* Log cleanup info before we touch indexes */
vacuum_log_cleanup_info(onerel, vacrelstats);
/* Remove index entries */
for (i = 0; i < nindexes; i++)
lazy_vacuum_index(Irel[i],
@@ -868,7 +908,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
recptr = log_heap_clean(onerel, buffer,
NULL, 0, NULL, 0,
unused, uncnt,
false);
vacrelstats->latestRemovedXid, false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}