mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-25 13:17:41 +03:00 
			
		
		
		
	Fix BRIN to use SnapshotAny during summarization
For correctness of summarization results, it is critical that the snapshot used during the summarization scan is able to see all tuples that are live to all transactions -- including tuples inserted or deleted by in-progress transactions. Otherwise, it would be possible for a transaction to insert a tuple, then idle for a long time while a concurrent transaction executes summarization of the range: this would result in the inserted value not being considered in the summary. Previously we were trying to use a MVCC snapshot in conjunction with adding a "placeholder" tuple in the index: the snapshot would see all committed tuples, and the placeholder tuple would catch insertions by any new inserters. The hole is that prior insertions by transactions that are still in progress by the time the MVCC snapshot was taken were ignored. Kevin Grittner reported this as a bogus error message during vacuum with default transaction isolation mode set to repeatable read (because the error report mentioned a function name not being invoked during), but the problem is larger than that. To fix, tweak IndexBuildHeapRangeScan to have a new mode that behaves the way we need using SnapshotAny visibility rules. This change simplifies the BRIN code a bit, mainly by removing large comments that were mistaken. Instead, rely on the SnapshotAny semantics to provide what it needs. (The business about a placeholder tuple needs to remain: that covers the case that a transaction inserts a a tuple in a page that summarization already scanned.) Discussion: https://www.postgresql.org/message-id/20150731175700.GX2441@postgresql.org In passing, remove a couple of unused declarations from brin.h and reword a comment to be proper English. This part submitted by Kevin Grittner. Backpatch to 9.5, where BRIN was introduced.
This commit is contained in:
		| @@ -696,7 +696,7 @@ brinbuildempty(PG_FUNCTION_ARGS) | |||||||
|  * |  * | ||||||
|  * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap |  * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap | ||||||
|  * tuple is deleted), meaning the need to re-run summarization on the affected |  * tuple is deleted), meaning the need to re-run summarization on the affected | ||||||
|  * range.  Need to an extra flag in brintuples for that. |  * range.  Would need to add an extra flag in brintuples for that. | ||||||
|  */ |  */ | ||||||
| Datum | Datum | ||||||
| brinbulkdelete(PG_FUNCTION_ARGS) | brinbulkdelete(PG_FUNCTION_ARGS) | ||||||
| @@ -951,9 +951,13 @@ summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, | |||||||
| 	 * Execute the partial heap scan covering the heap blocks in the specified | 	 * Execute the partial heap scan covering the heap blocks in the specified | ||||||
| 	 * page range, summarizing the heap tuples in it.  This scan stops just | 	 * page range, summarizing the heap tuples in it.  This scan stops just | ||||||
| 	 * short of brinbuildCallback creating the new index entry. | 	 * short of brinbuildCallback creating the new index entry. | ||||||
|  | 	 * | ||||||
|  | 	 * Note that it is critical we use the "any visible" mode of | ||||||
|  | 	 * IndexBuildHeapRangeScan here: otherwise, we would miss tuples inserted | ||||||
|  | 	 * by transactions that are still in progress, among other corner cases. | ||||||
| 	 */ | 	 */ | ||||||
| 	state->bs_currRangeStart = heapBlk; | 	state->bs_currRangeStart = heapBlk; | ||||||
| 	IndexBuildHeapRangeScan(heapRel, state->bs_irel, indexInfo, false, | 	IndexBuildHeapRangeScan(heapRel, state->bs_irel, indexInfo, false, true, | ||||||
| 							heapBlk, state->bs_pagesPerRange, | 							heapBlk, state->bs_pagesPerRange, | ||||||
| 							brinbuildCallback, (void *) state); | 							brinbuildCallback, (void *) state); | ||||||
|  |  | ||||||
| @@ -1058,36 +1062,6 @@ brinsummarize(Relation index, Relation heapRel, double *numSummarized, | |||||||
| 				state = initialize_brin_buildstate(index, revmap, | 				state = initialize_brin_buildstate(index, revmap, | ||||||
| 												   pagesPerRange); | 												   pagesPerRange); | ||||||
| 				indexInfo = BuildIndexInfo(index); | 				indexInfo = BuildIndexInfo(index); | ||||||
|  |  | ||||||
| 				/* |  | ||||||
| 				 * We only have ShareUpdateExclusiveLock on the table, and |  | ||||||
| 				 * therefore other sessions may insert tuples into the range |  | ||||||
| 				 * we're going to scan.  This is okay, because we take |  | ||||||
| 				 * additional precautions to avoid losing the additional |  | ||||||
| 				 * tuples; see comments in summarize_range.  Set the |  | ||||||
| 				 * concurrent flag, which causes IndexBuildHeapRangeScan to |  | ||||||
| 				 * use a snapshot other than SnapshotAny, and silences |  | ||||||
| 				 * warnings emitted there. |  | ||||||
| 				 */ |  | ||||||
| 				indexInfo->ii_Concurrent = true; |  | ||||||
|  |  | ||||||
| 				/* |  | ||||||
| 				 * If using transaction-snapshot mode, it would be possible |  | ||||||
| 				 * for another transaction to insert a tuple that's not |  | ||||||
| 				 * visible to our snapshot if we have already acquired one, |  | ||||||
| 				 * when in snapshot-isolation mode; therefore, disallow this |  | ||||||
| 				 * from running in such a transaction unless a snapshot hasn't |  | ||||||
| 				 * been acquired yet. |  | ||||||
| 				 * |  | ||||||
| 				 * This code is called by VACUUM and |  | ||||||
| 				 * brin_summarize_new_values. Have the error message mention |  | ||||||
| 				 * the latter because VACUUM cannot run in a transaction and |  | ||||||
| 				 * thus cannot cause this issue. |  | ||||||
| 				 */ |  | ||||||
| 				if (IsolationUsesXactSnapshot() && FirstSnapshotSet) |  | ||||||
| 					ereport(ERROR, |  | ||||||
| 							(errcode(ERRCODE_INVALID_TRANSACTION_STATE), |  | ||||||
| 							 errmsg("brin_summarize_new_values() cannot run in a transaction that has already obtained a snapshot"))); |  | ||||||
| 			} | 			} | ||||||
| 			summarize_range(indexInfo, state, heapRel, heapBlk); | 			summarize_range(indexInfo, state, heapRel, heapBlk); | ||||||
|  |  | ||||||
| @@ -1111,7 +1085,10 @@ brinsummarize(Relation index, Relation heapRel, double *numSummarized, | |||||||
| 	/* free resources */ | 	/* free resources */ | ||||||
| 	brinRevmapTerminate(revmap); | 	brinRevmapTerminate(revmap); | ||||||
| 	if (state) | 	if (state) | ||||||
|  | 	{ | ||||||
| 		terminate_brin_buildstate(state); | 		terminate_brin_buildstate(state); | ||||||
|  | 		pfree(indexInfo); | ||||||
|  | 	} | ||||||
| } | } | ||||||
|  |  | ||||||
| /* | /* | ||||||
|   | |||||||
| @@ -2161,6 +2161,7 @@ IndexBuildHeapScan(Relation heapRelation, | |||||||
| { | { | ||||||
| 	return IndexBuildHeapRangeScan(heapRelation, indexRelation, | 	return IndexBuildHeapRangeScan(heapRelation, indexRelation, | ||||||
| 								   indexInfo, allow_sync, | 								   indexInfo, allow_sync, | ||||||
|  | 								   false, | ||||||
| 								   0, InvalidBlockNumber, | 								   0, InvalidBlockNumber, | ||||||
| 								   callback, callback_state); | 								   callback, callback_state); | ||||||
| } | } | ||||||
| @@ -2170,12 +2171,17 @@ IndexBuildHeapScan(Relation heapRelation, | |||||||
|  * number of blocks are scanned.  Scan to end-of-rel can be signalled by |  * number of blocks are scanned.  Scan to end-of-rel can be signalled by | ||||||
|  * passing InvalidBlockNumber as numblocks.  Note that restricting the range |  * passing InvalidBlockNumber as numblocks.  Note that restricting the range | ||||||
|  * to scan cannot be done when requesting syncscan. |  * to scan cannot be done when requesting syncscan. | ||||||
|  |  * | ||||||
|  |  * When "anyvisible" mode is requested, all tuples visible to any transaction | ||||||
|  |  * are considered, including those inserted or deleted by transactions that are | ||||||
|  |  * still in progress. | ||||||
|  */ |  */ | ||||||
| double | double | ||||||
| IndexBuildHeapRangeScan(Relation heapRelation, | IndexBuildHeapRangeScan(Relation heapRelation, | ||||||
| 						Relation indexRelation, | 						Relation indexRelation, | ||||||
| 						IndexInfo *indexInfo, | 						IndexInfo *indexInfo, | ||||||
| 						bool allow_sync, | 						bool allow_sync, | ||||||
|  | 						bool anyvisible, | ||||||
| 						BlockNumber start_blockno, | 						BlockNumber start_blockno, | ||||||
| 						BlockNumber numblocks, | 						BlockNumber numblocks, | ||||||
| 						IndexBuildCallback callback, | 						IndexBuildCallback callback, | ||||||
| @@ -2209,6 +2215,12 @@ IndexBuildHeapRangeScan(Relation heapRelation, | |||||||
| 	checking_uniqueness = (indexInfo->ii_Unique || | 	checking_uniqueness = (indexInfo->ii_Unique || | ||||||
| 						   indexInfo->ii_ExclusionOps != NULL); | 						   indexInfo->ii_ExclusionOps != NULL); | ||||||
|  |  | ||||||
|  | 	/* | ||||||
|  | 	 * "Any visible" mode is not compatible with uniqueness checks; make sure | ||||||
|  | 	 * only one of those is requested. | ||||||
|  | 	 */ | ||||||
|  | 	Assert(!(anyvisible && checking_uniqueness)); | ||||||
|  |  | ||||||
| 	/* | 	/* | ||||||
| 	 * Need an EState for evaluation of index expressions and partial-index | 	 * Need an EState for evaluation of index expressions and partial-index | ||||||
| 	 * predicates.  Also a slot to hold the current tuple. | 	 * predicates.  Also a slot to hold the current tuple. | ||||||
| @@ -2236,6 +2248,9 @@ IndexBuildHeapRangeScan(Relation heapRelation, | |||||||
| 	{ | 	{ | ||||||
| 		snapshot = RegisterSnapshot(GetTransactionSnapshot()); | 		snapshot = RegisterSnapshot(GetTransactionSnapshot()); | ||||||
| 		OldestXmin = InvalidTransactionId;		/* not used */ | 		OldestXmin = InvalidTransactionId;		/* not used */ | ||||||
|  |  | ||||||
|  | 		/* "any visible" mode is not compatible with this */ | ||||||
|  | 		Assert(!anyvisible); | ||||||
| 	} | 	} | ||||||
| 	else | 	else | ||||||
| 	{ | 	{ | ||||||
| @@ -2363,6 +2378,17 @@ IndexBuildHeapRangeScan(Relation heapRelation, | |||||||
| 					break; | 					break; | ||||||
| 				case HEAPTUPLE_INSERT_IN_PROGRESS: | 				case HEAPTUPLE_INSERT_IN_PROGRESS: | ||||||
|  |  | ||||||
|  | 					/* | ||||||
|  | 					 * In "anyvisible" mode, this tuple is visible and we don't | ||||||
|  | 					 * need any further checks. | ||||||
|  | 					 */ | ||||||
|  | 					if (anyvisible) | ||||||
|  | 					{ | ||||||
|  | 						indexIt = true; | ||||||
|  | 						tupleIsAlive = true; | ||||||
|  | 						break; | ||||||
|  | 					} | ||||||
|  |  | ||||||
| 					/* | 					/* | ||||||
| 					 * Since caller should hold ShareLock or better, normally | 					 * Since caller should hold ShareLock or better, normally | ||||||
| 					 * the only way to see this is if it was inserted earlier | 					 * the only way to see this is if it was inserted earlier | ||||||
| @@ -2409,8 +2435,16 @@ IndexBuildHeapRangeScan(Relation heapRelation, | |||||||
|  |  | ||||||
| 					/* | 					/* | ||||||
| 					 * As with INSERT_IN_PROGRESS case, this is unexpected | 					 * As with INSERT_IN_PROGRESS case, this is unexpected | ||||||
| 					 * unless it's our own deletion or a system catalog. | 					 * unless it's our own deletion or a system catalog; | ||||||
|  | 					 * but in anyvisible mode, this tuple is visible. | ||||||
| 					 */ | 					 */ | ||||||
|  | 					if (anyvisible) | ||||||
|  | 					{ | ||||||
|  | 						indexIt = true; | ||||||
|  | 						tupleIsAlive = false; | ||||||
|  | 						break; | ||||||
|  | 					} | ||||||
|  |  | ||||||
| 					xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); | 					xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); | ||||||
| 					if (!TransactionIdIsCurrentTransactionId(xwait)) | 					if (!TransactionIdIsCurrentTransactionId(xwait)) | ||||||
| 					{ | 					{ | ||||||
|   | |||||||
| @@ -22,7 +22,6 @@ extern Datum brinbuild(PG_FUNCTION_ARGS); | |||||||
| extern Datum brinbuildempty(PG_FUNCTION_ARGS); | extern Datum brinbuildempty(PG_FUNCTION_ARGS); | ||||||
| extern Datum brininsert(PG_FUNCTION_ARGS); | extern Datum brininsert(PG_FUNCTION_ARGS); | ||||||
| extern Datum brinbeginscan(PG_FUNCTION_ARGS); | extern Datum brinbeginscan(PG_FUNCTION_ARGS); | ||||||
| extern Datum bringettuple(PG_FUNCTION_ARGS); |  | ||||||
| extern Datum bringetbitmap(PG_FUNCTION_ARGS); | extern Datum bringetbitmap(PG_FUNCTION_ARGS); | ||||||
| extern Datum brinrescan(PG_FUNCTION_ARGS); | extern Datum brinrescan(PG_FUNCTION_ARGS); | ||||||
| extern Datum brinendscan(PG_FUNCTION_ARGS); | extern Datum brinendscan(PG_FUNCTION_ARGS); | ||||||
| @@ -30,7 +29,6 @@ extern Datum brinmarkpos(PG_FUNCTION_ARGS); | |||||||
| extern Datum brinrestrpos(PG_FUNCTION_ARGS); | extern Datum brinrestrpos(PG_FUNCTION_ARGS); | ||||||
| extern Datum brinbulkdelete(PG_FUNCTION_ARGS); | extern Datum brinbulkdelete(PG_FUNCTION_ARGS); | ||||||
| extern Datum brinvacuumcleanup(PG_FUNCTION_ARGS); | extern Datum brinvacuumcleanup(PG_FUNCTION_ARGS); | ||||||
| extern Datum brincanreturn(PG_FUNCTION_ARGS); |  | ||||||
| extern Datum brincostestimate(PG_FUNCTION_ARGS); | extern Datum brincostestimate(PG_FUNCTION_ARGS); | ||||||
| extern Datum brinoptions(PG_FUNCTION_ARGS); | extern Datum brinoptions(PG_FUNCTION_ARGS); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -105,6 +105,7 @@ extern double IndexBuildHeapRangeScan(Relation heapRelation, | |||||||
| 						Relation indexRelation, | 						Relation indexRelation, | ||||||
| 						IndexInfo *indexInfo, | 						IndexInfo *indexInfo, | ||||||
| 						bool allow_sync, | 						bool allow_sync, | ||||||
|  | 						bool anyvisible, | ||||||
| 						BlockNumber start_blockno, | 						BlockNumber start_blockno, | ||||||
| 						BlockNumber end_blockno, | 						BlockNumber end_blockno, | ||||||
| 						IndexBuildCallback callback, | 						IndexBuildCallback callback, | ||||||
|   | |||||||
							
								
								
									
										39
									
								
								src/test/isolation/expected/brin-1.out
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								src/test/isolation/expected/brin-1.out
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,39 @@ | |||||||
|  | Parsed test spec with 2 sessions | ||||||
|  |  | ||||||
|  | starting permutation: s2check s1b s2b s1i s2summ s1c s2c s2check | ||||||
|  | step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass); | ||||||
|  | itemoffset     blknum         attnum         allnulls       hasnulls       placeholder    value           | ||||||
|  |  | ||||||
|  | 1              0              1              f              f              f              {1 .. 1}        | ||||||
|  | step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ; | ||||||
|  | step s2b: BEGIN ISOLATION LEVEL REPEATABLE READ; SELECT 1; | ||||||
|  | ?column?        | ||||||
|  |  | ||||||
|  | 1               | ||||||
|  | step s1i: INSERT INTO brin_iso VALUES (1000); | ||||||
|  | step s2summ: SELECT brin_summarize_new_values('brinidx'::regclass); | ||||||
|  | brin_summarize_new_values | ||||||
|  |  | ||||||
|  | 1               | ||||||
|  | step s1c: COMMIT; | ||||||
|  | step s2c: COMMIT; | ||||||
|  | step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass); | ||||||
|  | itemoffset     blknum         attnum         allnulls       hasnulls       placeholder    value           | ||||||
|  |  | ||||||
|  | 1              0              1              f              f              f              {1 .. 1}        | ||||||
|  | 2              1              1              f              f              f              {1 .. 1000}     | ||||||
|  |  | ||||||
|  | starting permutation: s2check s1b s1i s2vacuum s1c s2check | ||||||
|  | step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass); | ||||||
|  | itemoffset     blknum         attnum         allnulls       hasnulls       placeholder    value           | ||||||
|  |  | ||||||
|  | 1              0              1              f              f              f              {1 .. 1}        | ||||||
|  | step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ; | ||||||
|  | step s1i: INSERT INTO brin_iso VALUES (1000); | ||||||
|  | step s2vacuum: VACUUM brin_iso; | ||||||
|  | step s1c: COMMIT; | ||||||
|  | step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass); | ||||||
|  | itemoffset     blknum         attnum         allnulls       hasnulls       placeholder    value           | ||||||
|  |  | ||||||
|  | 1              0              1              f              f              f              {1 .. 1}        | ||||||
|  | 2              1              1              f              f              f              {1 .. 1000}     | ||||||
| @@ -36,6 +36,7 @@ test: skip-locked | |||||||
| test: skip-locked-2 | test: skip-locked-2 | ||||||
| test: skip-locked-3 | test: skip-locked-3 | ||||||
| test: skip-locked-4 | test: skip-locked-4 | ||||||
|  | test: brin-1 | ||||||
| test: drop-index-concurrently-1 | test: drop-index-concurrently-1 | ||||||
| test: alter-table-1 | test: alter-table-1 | ||||||
| test: alter-table-2 | test: alter-table-2 | ||||||
|   | |||||||
							
								
								
									
										44
									
								
								src/test/isolation/specs/brin-1.spec
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								src/test/isolation/specs/brin-1.spec
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,44 @@ | |||||||
|  | # This test verifies that values inserted in transactions still in progress | ||||||
|  | # are considered during concurrent range summarization (either using the | ||||||
|  | # brin_summarize_new_values function or regular VACUUM). | ||||||
|  |  | ||||||
|  | setup | ||||||
|  | { | ||||||
|  |     CREATE TABLE brin_iso ( | ||||||
|  |         value int | ||||||
|  |     ) WITH (fillfactor=10); | ||||||
|  |     CREATE INDEX brinidx ON brin_iso USING brin (value) WITH (pages_per_range=1); | ||||||
|  |     -- this fills the first page | ||||||
|  |     DO $$ | ||||||
|  |     DECLARE curtid tid; | ||||||
|  |     BEGIN | ||||||
|  |       LOOP | ||||||
|  |         INSERT INTO brin_iso VALUES (1) RETURNING ctid INTO curtid; | ||||||
|  |         EXIT WHEN curtid > tid '(1, 0)'; | ||||||
|  |       END LOOP; | ||||||
|  |     END; | ||||||
|  |     $$; | ||||||
|  |     CREATE EXTENSION IF NOT EXISTS pageinspect; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | teardown | ||||||
|  | { | ||||||
|  |     DROP TABLE brin_iso; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | session "s1" | ||||||
|  | step "s1b"		{ BEGIN ISOLATION LEVEL REPEATABLE READ; } | ||||||
|  | step "s1i"		{ INSERT INTO brin_iso VALUES (1000); } | ||||||
|  | step "s1c"		{ COMMIT; } | ||||||
|  |  | ||||||
|  | session "s2" | ||||||
|  | step "s2b"		{ BEGIN ISOLATION LEVEL REPEATABLE READ; SELECT 1; } | ||||||
|  | step "s2summ"	{ SELECT brin_summarize_new_values('brinidx'::regclass); } | ||||||
|  | step "s2c"		{ COMMIT; } | ||||||
|  |  | ||||||
|  | step "s2vacuum"	{ VACUUM brin_iso; } | ||||||
|  |  | ||||||
|  | step "s2check"	{ SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass); } | ||||||
|  |  | ||||||
|  | permutation "s2check" "s1b" "s2b" "s1i" "s2summ" "s1c" "s2c" "s2check" | ||||||
|  | permutation "s2check" "s1b" "s1i" "s2vacuum" "s1c" "s2check" | ||||||
		Reference in New Issue
	
	Block a user