1
0
mirror of https://github.com/postgres/postgres.git synced 2025-04-20 00:42:27 +03:00

Fix test races between syscache-update-pruned.spec and autovacuum.

This spec fails ~3% of my Valgrind runs, and the spec has failed on Valgrind
buildfarm member skink at a similar rate.  Two problems contributed to that:

- A competing buffer pin triggered VACUUM's lazy_scan_noprune() path, causing
  "tuples missed: 1 dead from 1 pages not removed due to cleanup lock
  contention".  FREEZE fixes that.

- The spec ran lazy VACUUM immediately after VACUUM FULL.  The spec implicitly
  assumed lazy VACUUM prunes the one tuple that VACUUM FULL made dead.  First
  wait for old snapshots, making that assumption reliable.

This also adds two forms of defense in depth:

- Wait for snapshots using shared catalog pruning rules (VISHORIZON_SHARED).
  This avoids the removable cutoff moving backward when an XID-bearing
  autoanalyze process runs in another database.  That may never happen in this
  test, but it's cheap insurance.

- Use lazy VACUUM option DISABLE_PAGE_SKIPPING.  Commit
  c2dc1a79767a0f947e1145f82eb65dfe4360d25f did this for a related requirement
  in other tests, but I suspect FREEZE is necessary and sufficient in all
  these tests.

Back-patch to v17, where the test first appeared.

Reported-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/sv3taq4e6ea4qckimien3nxp3sz4b6cw6sfcy4nhwl52zpur4g@h6i6tohxmizu
Backpatch-through: 17
This commit is contained in:
Noah Misch 2025-04-09 07:23:39 -07:00
parent 17a165d60f
commit 42bb4fb728
4 changed files with 41 additions and 16 deletions

View File

@ -7,7 +7,7 @@ step at2:
FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger();
<waiting ...>
step waitprunable4: CALL vactest.wait_prunable();
step vac4: VACUUM pg_class;
step vac4: VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class;
step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; <waiting ...>
step wakeinval4:
SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo');
@ -30,7 +30,7 @@ step at2:
FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger();
<waiting ...>
step waitprunable4: CALL vactest.wait_prunable();
step vac4: VACUUM pg_class;
step vac4: VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class;
step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; <waiting ...>
step wakeinval4:
SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo');
@ -61,7 +61,7 @@ step mkrels4:
step r3: ROLLBACK;
step waitprunable4: CALL vactest.wait_prunable();
step vac4: VACUUM pg_class;
step vac4: VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class;
step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; <waiting ...>
step wakeinval4:
SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo');

View File

@ -7,7 +7,7 @@ step at2:
FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger();
<waiting ...>
step waitprunable4: CALL vactest.wait_prunable();
step vac4: VACUUM pg_class;
step vac4: VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class;
step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; <waiting ...>
step wakeinval4:
SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo');
@ -29,7 +29,7 @@ step at2:
FOR EACH ROW EXECUTE PROCEDURE suppress_redundant_updates_trigger();
<waiting ...>
step waitprunable4: CALL vactest.wait_prunable();
step vac4: VACUUM pg_class;
step vac4: VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class;
step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; <waiting ...>
step wakeinval4:
SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo');
@ -59,7 +59,7 @@ step mkrels4:
step r3: ROLLBACK;
step waitprunable4: CALL vactest.wait_prunable();
step vac4: VACUUM pg_class;
step vac4: VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class;
step grant1: GRANT SELECT ON vactest.orig50 TO PUBLIC; <waiting ...>
step wakeinval4:
SELECT FROM injection_points_detach('AtEOXact_Inval-with-transInvalInfo');

View File

@ -17,7 +17,9 @@
#include "access/table.h"
#include "fmgr.h"
#include "miscadmin.h"
#include "postmaster/autovacuum.h"
#include "storage/procarray.h"
#include "utils/rel.h"
#include "utils/xid8.h"
/*
@ -28,11 +30,12 @@
* that. For the causes of backward movement, see
* postgr.es/m/CAEze2Wj%2BV0kTx86xB_YbyaqTr5hnE_igdWAwuhSyjXBYscf5-Q%40mail.gmail.com
* and the header comment for ComputeXidHorizons(). One can assume this
* doesn't move backward if one arranges for concurrent activity not to reach
* AbortTransaction() and not to allocate an XID while connected to another
* database. Non-runningcheck tests can control most concurrent activity,
* except autovacuum and the isolationtester control connection. Neither
* allocates XIDs, and AbortTransaction() in those would justify test failure.
* doesn't move backward if one (a) passes a shared catalog as the argument
* and (b) arranges for concurrent activity not to reach AbortTransaction().
* Non-runningcheck tests can control most concurrent activity, except
* autovacuum and the isolationtester control connection. AbortTransaction()
* in those would justify test failure. Seeing autoanalyze can allocate an
* XID in any database, (a) ensures we'll consistently not ignore those XIDs.
*/
PG_FUNCTION_INFO_V1(removable_cutoff);
Datum
@ -47,6 +50,10 @@ removable_cutoff(PG_FUNCTION_ARGS)
if (!PG_ARGISNULL(0))
rel = table_open(PG_GETARG_OID(0), AccessShareLock);
if (!rel->rd_rel->relisshared && autovacuum_start_daemon)
elog(WARNING,
"removable_cutoff(non-shared-rel) can move backward under autovacuum=on");
/*
* No lock or snapshot necessarily prevents oldestXid from advancing past
* "xid" while this function runs. That concerns us only in that we must

View File

@ -53,10 +53,12 @@ setup
barrier := pg_current_xact_id();
-- autovacuum worker RelationCacheInitializePhase3() or the
-- isolationtester control connection might hold a snapshot that
-- limits pruning. Sleep until that clears.
-- limits pruning. Sleep until that clears. See comments at
-- removable_cutoff() for why we pass a shared catalog rather than
-- pg_class, the table we'll prune.
LOOP
ROLLBACK; -- release MyProc->xmin, which could be the oldest
cutoff := removable_cutoff('pg_class');
cutoff := removable_cutoff('pg_database');
EXIT WHEN cutoff >= barrier;
RAISE LOG 'removable cutoff %; waiting for %', cutoff, barrier;
PERFORM pg_sleep(.1);
@ -64,9 +66,24 @@ setup
END
$$;
}
setup { CALL vactest.wait_prunable(); -- maximize next two VACUUMs }
# Eliminate HEAPTUPLE_DEAD and HEAPTUPLE_RECENTLY_DEAD from pg_class.
# Minimize free space.
#
# If we kept HEAPTUPLE_RECENTLY_DEAD, step vac4 could prune what we missed,
# breaking some permutation assumptions. Specifically, the next pg_class
# tuple could end up in free space we failed to liberate here, instead of
# going in the specific free space vac4 intended to liberate for it.
setup { CALL vactest.wait_prunable(); -- maximize VACUUM FULL }
setup { VACUUM FULL pg_class; -- reduce free space }
setup { VACUUM FREEZE pg_class; -- populate fsm etc. }
# Remove the one tuple that VACUUM FULL makes dead, a tuple pertaining to
# pg_class itself. Populate the FSM for pg_class.
#
# wait_prunable waits for snapshots that would thwart pruning, while FREEZE
# waits for buffer pins that would thwart pruning. DISABLE_PAGE_SKIPPING
# isn't actually needed, but other pruning-dependent tests use it. If those
# tests remove it, remove it here.
setup { CALL vactest.wait_prunable(); -- maximize lazy VACUUM }
setup { VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class; -- fill fsm etc. }
setup
{
SELECT FROM vactest.mkrels('orig', 1, 49);
@ -115,7 +132,8 @@ step r3 { ROLLBACK; }
# Non-blocking actions.
session s4
step waitprunable4 { CALL vactest.wait_prunable(); }
step vac4 { VACUUM pg_class; }
# Eliminate HEAPTUPLE_DEAD. See above discussion of FREEZE.
step vac4 { VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) pg_class; }
# Reuse the lp that s1 is waiting to change. I've observed reuse at the 1st
# or 18th CREATE, so create excess.
step mkrels4 {