mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-25 13:17:41 +03:00 
			
		
		
		
	Introduce WAL records to log reuse of btree pages, allowing conflict
resolution during Hot Standby. Page reuse interlock requested by Tom. Analysis and patch by me.
This commit is contained in:
		| @@ -9,7 +9,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.118 2010/02/08 04:33:53 tgl Exp $ | ||||
|  *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.119 2010/02/13 00:59:58 sriggs Exp $ | ||||
|  * | ||||
|  *	NOTES | ||||
|  *	   Postgres btree pages look like ordinary relation pages.	The opaque | ||||
| @@ -446,6 +446,48 @@ _bt_checkpage(Relation rel, Buffer buf) | ||||
| 				 errhint("Please REINDEX it."))); | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * Log the reuse of a page from the FSM. | ||||
|  */ | ||||
| static void | ||||
| _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid) | ||||
| { | ||||
| 	if (rel->rd_istemp) | ||||
| 		return; | ||||
|  | ||||
| 	/* No ereport(ERROR) until changes are logged */ | ||||
| 	START_CRIT_SECTION(); | ||||
|  | ||||
| 	/* | ||||
| 	 * We don't do MarkBufferDirty here because we're about to initialise | ||||
| 	 * the page, and nobody else can see it yet. | ||||
| 	 */ | ||||
|  | ||||
| 	/* XLOG stuff */ | ||||
| 	{ | ||||
| 		XLogRecPtr	recptr; | ||||
| 		XLogRecData rdata[1]; | ||||
| 		xl_btree_reuse_page xlrec_reuse; | ||||
|  | ||||
| 		xlrec_reuse.node = rel->rd_node; | ||||
| 		xlrec_reuse.block = blkno; | ||||
| 		xlrec_reuse.latestRemovedXid = latestRemovedXid; | ||||
| 		rdata[0].data = (char *) &xlrec_reuse; | ||||
| 		rdata[0].len = SizeOfBtreeReusePage; | ||||
| 		rdata[0].buffer = InvalidBuffer; | ||||
| 		rdata[0].next = NULL; | ||||
|  | ||||
| 		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE, rdata); | ||||
|  | ||||
| 		/* | ||||
| 		 * We don't do PageSetLSN or PageSetTLI here because | ||||
| 		 * we're about to initialise the page, so no need. | ||||
| 		 */ | ||||
| 	} | ||||
|  | ||||
| 	END_CRIT_SECTION(); | ||||
| } | ||||
|  | ||||
| /* | ||||
|  *	_bt_getbuf() -- Get a buffer by block number for read or write. | ||||
|  * | ||||
| @@ -510,7 +552,19 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) | ||||
| 			{ | ||||
| 				page = BufferGetPage(buf); | ||||
| 				if (_bt_page_recyclable(page)) | ||||
| 				{ | ||||
| 				{					 | ||||
| 					/* | ||||
| 					 * If we are generating WAL for Hot Standby then create | ||||
| 					 * a WAL record that will allow us to conflict with | ||||
| 					 * queries running on standby. | ||||
| 					 */ | ||||
| 					if (XLogStandbyInfoActive()) | ||||
| 					{ | ||||
| 						BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
|  | ||||
| 						_bt_log_reuse_page(rel, blkno, opaque->btpo.xact); | ||||
| 					} | ||||
|  | ||||
| 					/* Okay to use page.  Re-initialize and return it */ | ||||
| 					_bt_pageinit(page, BufferGetPageSize(buf)); | ||||
| 					return buf; | ||||
|   | ||||
| @@ -8,7 +8,7 @@ | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.60 2010/02/08 04:33:53 tgl Exp $ | ||||
|  *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.61 2010/02/13 00:59:58 sriggs Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -814,26 +814,48 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record) | ||||
| { | ||||
| 	uint8		info = record->xl_info & ~XLR_INFO_MASK; | ||||
|  | ||||
| 	/* | ||||
| 	 * Btree delete records can conflict with standby queries. You might | ||||
| 	 * think that vacuum records would conflict as well, but we've handled | ||||
| 	 * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid | ||||
| 	 * cleaned by the vacuum of the heap and so we can resolve any conflicts | ||||
| 	 * just once when that arrives. After that we know that no conflicts | ||||
| 	 * exist from individual btree vacuum records on that index. | ||||
| 	 */ | ||||
| 	if (InHotStandby && info == XLOG_BTREE_DELETE) | ||||
| 	if (InHotStandby) | ||||
| 	{ | ||||
| 		xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); | ||||
| 		switch (info) | ||||
| 		{ | ||||
| 			case XLOG_BTREE_DELETE: | ||||
| 				/* | ||||
| 				 * Btree delete records can conflict with standby queries. You might | ||||
| 				 * think that vacuum records would conflict as well, but we've handled | ||||
| 				 * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid | ||||
| 				 * cleaned by the vacuum of the heap and so we can resolve any conflicts | ||||
| 				 * just once when that arrives. After that we know that no conflicts | ||||
| 				 * exist from individual btree vacuum records on that index. | ||||
| 				 */ | ||||
| 				{ | ||||
| 					xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); | ||||
|  | ||||
| 		/* | ||||
| 		 * XXX Currently we put everybody on death row, because | ||||
| 		 * currently _bt_delitems() supplies InvalidTransactionId. | ||||
| 		 * This can be fairly painful, so providing a better value | ||||
| 		 * here is worth some thought and possibly some effort to | ||||
| 		 * improve. | ||||
| 		 */ | ||||
| 		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node); | ||||
| 					/* | ||||
| 					 * XXX Currently we put everybody on death row, because | ||||
| 					 * currently _bt_delitems() supplies InvalidTransactionId. | ||||
| 					 * This can be fairly painful, so providing a better value | ||||
| 					 * here is worth some thought and possibly some effort to | ||||
| 					 * improve. | ||||
| 					 */ | ||||
| 					ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node); | ||||
| 				} | ||||
| 				break; | ||||
|  | ||||
| 			case XLOG_BTREE_REUSE_PAGE: | ||||
| 				/* | ||||
| 				 * Btree reuse page records exist to provide a conflict point when we | ||||
| 				 * reuse pages in the index via the FSM. That's all they do, though. | ||||
| 				 */ | ||||
| 				{ | ||||
| 					xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record); | ||||
|  | ||||
| 					ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node); | ||||
| 				} | ||||
| 				return; | ||||
|  | ||||
| 			default: | ||||
| 				break; | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	/* | ||||
|   | ||||
| @@ -7,7 +7,7 @@ | ||||
|  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.128 2010/02/08 04:33:54 tgl Exp $ | ||||
|  * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.129 2010/02/13 00:59:58 sriggs Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -221,6 +221,7 @@ typedef struct BTMetaPageData | ||||
| #define XLOG_BTREE_DELETE_PAGE_HALF 0xB0		/* page deletion that makes | ||||
| 												 * parent half-dead */ | ||||
| #define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during vacuum */ | ||||
| #define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from FSM */ | ||||
|  | ||||
| /* | ||||
|  * All that we need to find changed index tuple | ||||
| @@ -321,6 +322,18 @@ typedef struct xl_btree_delete | ||||
|  | ||||
| #define SizeOfBtreeDelete	(offsetof(xl_btree_delete, latestRemovedXid) + sizeof(TransactionId)) | ||||
|  | ||||
| /* | ||||
|  * This is what we need to know about page reuse within btree. | ||||
|  */ | ||||
| typedef struct xl_btree_reuse_page | ||||
| { | ||||
| 	RelFileNode node; | ||||
| 	BlockNumber block; | ||||
| 	TransactionId	latestRemovedXid; | ||||
| } xl_btree_reuse_page; | ||||
|  | ||||
| #define SizeOfBtreeReusePage	(sizeof(xl_btree_reuse_page)) | ||||
|  | ||||
| /* | ||||
|  * This is what we need to know about vacuum of individual leaf index tuples. | ||||
|  * The WAL record can represent deletion of any number of index tuples on a | ||||
|   | ||||
		Reference in New Issue
	
	Block a user