mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-22 14:32:25 +03:00 
			
		
		
		
	amcheck: Distinguish interrupted page deletion from corruption.
This prevents false-positive reports about "the first child of leftmost target page is not leftmost of its level", "block %u is not leftmost" and "left link/right link pair". They appeared if amcheck ran before VACUUM cleaned things, after a cluster exited recovery between the first-stage and second-stage WAL records of a deletion. Back-patch to v11 (all supported versions). Reviewed by Peter Geoghegan. Discussion: https://postgr.es/m/20231005025232.c7.nmisch@google.com
This commit is contained in:
		| @@ -12,6 +12,7 @@ PGFILEDESC = "amcheck - function for verifying relation integrity" | ||||
|  | ||||
| REGRESS = check check_btree check_heap | ||||
|  | ||||
| EXTRA_INSTALL = contrib/pg_walinspect | ||||
| TAP_TESTS = 1 | ||||
|  | ||||
| ifdef USE_PGXS | ||||
|   | ||||
| @@ -42,6 +42,7 @@ tests += { | ||||
|       't/001_verify_heapam.pl', | ||||
|       't/002_cic.pl', | ||||
|       't/003_cic_2pc.pl', | ||||
|       't/005_pitr.pl', | ||||
|     ], | ||||
|   }, | ||||
| } | ||||
|   | ||||
							
								
								
									
										82
									
								
								contrib/amcheck/t/005_pitr.pl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										82
									
								
								contrib/amcheck/t/005_pitr.pl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,82 @@ | ||||
| # Copyright (c) 2021-2023, PostgreSQL Global Development Group | ||||
|  | ||||
| # Test integrity of intermediate states by PITR to those states | ||||
| use strict; | ||||
| use warnings; | ||||
| use PostgreSQL::Test::Cluster; | ||||
| use PostgreSQL::Test::Utils; | ||||
| use Test::More; | ||||
|  | ||||
| # origin node: generate WAL records of interest. | ||||
| my $origin = PostgreSQL::Test::Cluster->new('origin'); | ||||
| $origin->init(has_archiving => 1, allows_streaming => 1); | ||||
| $origin->append_conf('postgresql.conf', 'autovacuum = off'); | ||||
| $origin->start; | ||||
| $origin->backup('my_backup'); | ||||
| # Create a table with each of 6 PK values spanning 1/4 of a block.  Delete the | ||||
| # first four, so one index leaf is eligible for deletion.  Make a replication | ||||
| # slot just so pg_walinspect will always have access to later WAL. | ||||
| my $setup = <<EOSQL; | ||||
| BEGIN; | ||||
| CREATE EXTENSION amcheck; | ||||
| CREATE EXTENSION pg_walinspect; | ||||
| CREATE TABLE not_leftmost (c text STORAGE PLAIN); | ||||
| INSERT INTO not_leftmost | ||||
|   SELECT repeat(n::text, database_block_size / 4) | ||||
|   FROM generate_series(1,6) t(n), pg_control_init(); | ||||
| ALTER TABLE not_leftmost ADD CONSTRAINT not_leftmost_pk PRIMARY KEY (c); | ||||
| DELETE FROM not_leftmost WHERE c ~ '^[1-4]'; | ||||
| SELECT pg_create_physical_replication_slot('for_walinspect', true, false); | ||||
| COMMIT; | ||||
| EOSQL | ||||
| $origin->safe_psql('postgres', $setup); | ||||
| my $before_vacuum_lsn = | ||||
|   $origin->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); | ||||
| # VACUUM to delete the aforementioned leaf page.  Force an XLogFlush() by | ||||
| # dropping a permanent table.  That way, the XLogReader infrastructure can | ||||
| # always see VACUUM's records, even under synchronous_commit=off.  Finally, | ||||
| # find the LSN of that VACUUM's last UNLINK_PAGE record. | ||||
| my $vacuum = <<EOSQL; | ||||
| SET synchronous_commit = off; | ||||
| VACUUM (VERBOSE, INDEX_CLEANUP ON) not_leftmost; | ||||
| CREATE TABLE XLogFlush (); | ||||
| DROP TABLE XLogFlush; | ||||
| SELECT max(start_lsn) | ||||
|   FROM pg_get_wal_records_info('$before_vacuum_lsn', 'FFFFFFFF/FFFFFFFF') | ||||
|   WHERE resource_manager = 'Btree' AND record_type = 'UNLINK_PAGE'; | ||||
| EOSQL | ||||
| my $unlink_lsn = $origin->safe_psql('postgres', $vacuum); | ||||
| $origin->stop; | ||||
| die "did not find UNLINK_PAGE record" unless $unlink_lsn; | ||||
|  | ||||
| # replica node: amcheck at notable points in the WAL stream | ||||
| my $replica = PostgreSQL::Test::Cluster->new('replica'); | ||||
| $replica->init_from_backup($origin, 'my_backup', has_restoring => 1); | ||||
| $replica->append_conf('postgresql.conf', | ||||
| 	"recovery_target_lsn = '$unlink_lsn'"); | ||||
| $replica->append_conf('postgresql.conf', 'recovery_target_inclusive = off'); | ||||
| $replica->append_conf('postgresql.conf', 'recovery_target_action = promote'); | ||||
| $replica->start; | ||||
| $replica->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';") | ||||
|   or die "Timed out while waiting for PITR promotion"; | ||||
| # recovery done; run amcheck | ||||
| my $debug = "SET client_min_messages = 'debug1'"; | ||||
| my ($rc, $stderr); | ||||
| $rc = $replica->psql( | ||||
| 	'postgres', | ||||
| 	"$debug; SELECT bt_index_parent_check('not_leftmost_pk', true)", | ||||
| 	stderr => \$stderr); | ||||
| print STDERR $stderr, "\n"; | ||||
| is($rc, 0, "bt_index_parent_check passes"); | ||||
| like( | ||||
| 	$stderr, | ||||
| 	qr/interrupted page deletion detected/, | ||||
| 	"bt_index_parent_check: interrupted page deletion detected"); | ||||
| $rc = $replica->psql( | ||||
| 	'postgres', | ||||
| 	"$debug; SELECT bt_index_check('not_leftmost_pk', true)", | ||||
| 	stderr => \$stderr); | ||||
| print STDERR $stderr, "\n"; | ||||
| is($rc, 0, "bt_index_check passes"); | ||||
|  | ||||
| done_testing(); | ||||
| @@ -148,6 +148,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel, | ||||
| 								 bool rootdescend); | ||||
| static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, | ||||
| 											   BtreeLevel level); | ||||
| static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state, | ||||
| 										   BlockNumber start, | ||||
| 										   BTPageOpaque start_opaque); | ||||
| static void bt_recheck_sibling_links(BtreeCheckState *state, | ||||
| 									 BlockNumber btpo_prev_from_target, | ||||
| 									 BlockNumber leftcurrent); | ||||
| @@ -776,7 +779,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) | ||||
| 			 */ | ||||
| 			if (state->readonly) | ||||
| 			{ | ||||
| 				if (!P_LEFTMOST(opaque)) | ||||
| 				if (!bt_leftmost_ignoring_half_dead(state, current, opaque)) | ||||
| 					ereport(ERROR, | ||||
| 							(errcode(ERRCODE_INDEX_CORRUPTED), | ||||
| 							 errmsg("block %u is not leftmost in index \"%s\"", | ||||
| @@ -830,8 +833,16 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) | ||||
| 			 */ | ||||
| 		} | ||||
|  | ||||
| 		/* Sibling links should be in mutual agreement */ | ||||
| 		if (opaque->btpo_prev != leftcurrent) | ||||
| 		/* | ||||
| 		 * Sibling links should be in mutual agreement.  There arises | ||||
| 		 * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling | ||||
| 		 * of the parent's low-key downlink is half-dead.  (A half-dead page | ||||
| 		 * has no downlink from its parent.)  Under heavyweight locking, the | ||||
| 		 * last bt_leftmost_ignoring_half_dead() validated this btpo_prev. | ||||
| 		 * Without heavyweight locking, validation of the P_NONE case remains | ||||
| 		 * unimplemented. | ||||
| 		 */ | ||||
| 		if (opaque->btpo_prev != leftcurrent && leftcurrent != P_NONE) | ||||
| 			bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent); | ||||
|  | ||||
| 		/* Check level */ | ||||
| @@ -912,6 +923,66 @@ nextpage: | ||||
| 	return nextleveldown; | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of | ||||
|  * half-dead, sibling-linked pages to the left.  If a half-dead page appears | ||||
|  * under state->readonly, the database exited recovery between the first-stage | ||||
|  * and second-stage WAL records of a deletion. | ||||
|  */ | ||||
| static bool | ||||
| bt_leftmost_ignoring_half_dead(BtreeCheckState *state, | ||||
| 							   BlockNumber start, | ||||
| 							   BTPageOpaque start_opaque) | ||||
| { | ||||
| 	BlockNumber reached = start_opaque->btpo_prev, | ||||
| 				reached_from = start; | ||||
| 	bool		all_half_dead = true; | ||||
|  | ||||
| 	/* | ||||
| 	 * To handle the !readonly case, we'd need to accept BTP_DELETED pages and | ||||
| 	 * potentially observe nbtree/README "Page deletion and backwards scans". | ||||
| 	 */ | ||||
| 	Assert(state->readonly); | ||||
|  | ||||
| 	while (reached != P_NONE && all_half_dead) | ||||
| 	{ | ||||
| 		Page		page = palloc_btree_page(state, reached); | ||||
| 		BTPageOpaque reached_opaque = BTPageGetOpaque(page); | ||||
|  | ||||
| 		CHECK_FOR_INTERRUPTS(); | ||||
|  | ||||
| 		/* | ||||
| 		 * Try to detect btpo_prev circular links.  _bt_unlink_halfdead_page() | ||||
| 		 * writes that side-links will continue to point to the siblings. | ||||
| 		 * Check btpo_next for that property. | ||||
| 		 */ | ||||
| 		all_half_dead = P_ISHALFDEAD(reached_opaque) && | ||||
| 			reached != start && | ||||
| 			reached != reached_from && | ||||
| 			reached_opaque->btpo_next == reached_from; | ||||
| 		if (all_half_dead) | ||||
| 		{ | ||||
| 			XLogRecPtr	pagelsn = PageGetLSN(page); | ||||
|  | ||||
| 			/* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */ | ||||
| 			ereport(DEBUG1, | ||||
| 					(errcode(ERRCODE_NO_DATA), | ||||
| 					 errmsg_internal("harmless interrupted page deletion detected in index \"%s\"", | ||||
| 									 RelationGetRelationName(state->rel)), | ||||
| 					 errdetail_internal("Block=%u right block=%u page lsn=%X/%X.", | ||||
| 										reached, reached_from, | ||||
| 										LSN_FORMAT_ARGS(pagelsn)))); | ||||
|  | ||||
| 			reached_from = reached; | ||||
| 			reached = reached_opaque->btpo_prev; | ||||
| 		} | ||||
|  | ||||
| 		pfree(page); | ||||
| 	} | ||||
|  | ||||
| 	return all_half_dead; | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * Raise an error when target page's left link does not point back to the | ||||
|  * previous target page, called leftcurrent here.  The leftcurrent page's | ||||
| @@ -952,6 +1023,9 @@ bt_recheck_sibling_links(BtreeCheckState *state, | ||||
| 						 BlockNumber btpo_prev_from_target, | ||||
| 						 BlockNumber leftcurrent) | ||||
| { | ||||
| 	/* passing metapage to BTPageGetOpaque() would give irrelevant findings */ | ||||
| 	Assert(leftcurrent != P_NONE); | ||||
|  | ||||
| 	if (!state->readonly) | ||||
| 	{ | ||||
| 		Buffer		lbuf; | ||||
| @@ -1935,7 +2009,8 @@ bt_child_highkey_check(BtreeCheckState *state, | ||||
| 		opaque = BTPageGetOpaque(page); | ||||
|  | ||||
| 		/* The first page we visit at the level should be leftmost */ | ||||
| 		if (first && !BlockNumberIsValid(state->prevrightlink) && !P_LEFTMOST(opaque)) | ||||
| 		if (first && !BlockNumberIsValid(state->prevrightlink) && | ||||
| 			!bt_leftmost_ignoring_half_dead(state, blkno, opaque)) | ||||
| 			ereport(ERROR, | ||||
| 					(errcode(ERRCODE_INDEX_CORRUPTED), | ||||
| 					 errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"", | ||||
|   | ||||
		Reference in New Issue
	
	Block a user