1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-27 23:21:58 +03:00

Add TAP tests to check replication slot advance during the checkpoint

The new tests verify that logical and physical replication slots are still
valid after an immediate restart on checkpoint completion when the slot was
advanced during the checkpoint.

This commit introduces two new injection points to make these tests possible:

* checkpoint-before-old-wal-removal - triggered in the checkpointer process
  just before old WAL segments cleanup;
* logical-replication-slot-advance-segment - triggered in
  LogicalConfirmReceivedLocation() when restart_lsn was changed enough to
  point to the next WAL segment.

Discussion: https://postgr.es/m/flat/1d12d2-67235980-35-19a406a0%4063439497
Author: Vitaly Davydov <v.davydov@postgrespro.ru>
Author: Tomas Vondra <tomas@vondra.me>
Reviewed-by: Alexander Korotkov <aekorotkov@gmail.com>
Reviewed-by: Amit Kapila <amit.kapila16@gmail.com>
Backpatch-through: 17
This commit is contained in:
Alexander Korotkov
2025-06-14 03:35:27 +03:00
parent ca307d5cec
commit eb124c3d6d
5 changed files with 296 additions and 0 deletions

View File

@ -7498,6 +7498,10 @@ CreateCheckPoint(int flags)
if (PriorRedoPtr != InvalidXLogRecPtr)
UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
#ifdef USE_INJECTION_POINTS
INJECTION_POINT("checkpoint-before-old-wal-removal", NULL);
#endif
/*
* Delete old log files, those no longer needed for last checkpoint to
* prevent the disk holding the xlog from growing full.

View File

@ -29,6 +29,7 @@
#include "postgres.h"
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
#include "fmgr.h"
#include "miscadmin.h"
@ -41,6 +42,7 @@
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/builtins.h"
#include "utils/injection_point.h"
#include "utils/inval.h"
#include "utils/memutils.h"
@ -1825,9 +1827,13 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn)
{
bool updated_xmin = false;
bool updated_restart = false;
XLogRecPtr restart_lsn pg_attribute_unused();
SpinLockAcquire(&MyReplicationSlot->mutex);
/* remember the old restart lsn */
restart_lsn = MyReplicationSlot->data.restart_lsn;
/*
* Prevent moving the confirmed_flush backwards, as this could lead to
* data duplication issues caused by replicating already replicated
@ -1881,6 +1887,18 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn)
/* first write new xmin to disk, so we know what's up after a crash */
if (updated_xmin || updated_restart)
{
#ifdef USE_INJECTION_POINTS
XLogSegNo seg1,
seg2;
XLByteToSeg(restart_lsn, seg1, wal_segment_size);
XLByteToSeg(MyReplicationSlot->data.restart_lsn, seg2, wal_segment_size);
/* trigger injection point, but only if segment changes */
if (seg1 != seg2)
INJECTION_POINT("logical-replication-slot-advance-segment", NULL);
#endif
ReplicationSlotMarkDirty();
ReplicationSlotSave();
elog(DEBUG1, "updated xmin: %u restart: %u", updated_xmin, updated_restart);

View File

@ -54,6 +54,8 @@ tests += {
't/043_no_contrecord_switch.pl',
't/044_invalidate_inactive_slots.pl',
't/045_archive_restartpoint.pl',
't/046_checkpoint_logical_slot.pl',
't/047_checkpoint_physical_slot.pl'
],
},
}

View File

@ -0,0 +1,139 @@
# Copyright (c) 2025, PostgreSQL Global Development Group
#
# This test verifies the case when the logical slot is advanced during
# checkpoint. The test checks that the logical slot's restart_lsn still refers
# to an existing WAL segment after immediate restart.
#
use strict;
use warnings FATAL => 'all';
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;
# Injection points are required.  Treat an unset variable like "no": with
# fatal warnings enabled, comparing undef would abort the script instead of
# skipping it.
if (($ENV{enable_injection_points} // '') ne 'yes')
{
	plan skip_all => 'Injection points not supported by this build';
}

my $node = PostgreSQL::Test::Cluster->new('mike');
$node->init;
$node->append_conf('postgresql.conf',
	"shared_preload_libraries = 'injection_points'");
$node->append_conf('postgresql.conf', "wal_level = 'logical'");
$node->start;
$node->safe_psql('postgres', q(CREATE EXTENSION injection_points));

# Create a simple table to generate data into.
$node->safe_psql('postgres',
	q{create table t (id serial primary key, b text)});

# Create the two slots we'll need.
$node->safe_psql('postgres',
	q{select pg_create_logical_replication_slot('slot_logical', 'test_decoding')}
);
$node->safe_psql('postgres',
	q{select pg_create_physical_replication_slot('slot_physical', true)});

# Advance both slots to the current position just to have everything "valid".
$node->safe_psql('postgres',
	q{select count(*) from pg_logical_slot_get_changes('slot_logical', null, null)}
);
$node->safe_psql('postgres',
	q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
);

# Run checkpoint to flush current state to disk and set a baseline.
$node->safe_psql('postgres', q{checkpoint});

# Generate some transactions to get RUNNING_XACTS.
my $xacts = $node->background_psql('postgres');
$xacts->query_until(
	qr/run_xacts/,
	q(\echo run_xacts
SELECT 1 \watch 0.1
\q
));

# Insert 1M rows to generate a substantial amount of WAL (presumably enough
# to span several WAL segments).
$node->safe_psql('postgres',
	q{insert into t (b) select md5(i::text) from generate_series(1,1000000) s(i)}
);

# Run another checkpoint to establish a new checkpoint position.
$node->safe_psql('postgres', q{checkpoint});

# Another 1M rows, again to generate a substantial amount of WAL.
$node->safe_psql('postgres',
	q{insert into t (b) select md5(i::text) from generate_series(1,1000000) s(i)}
);

# Run another checkpoint, this time in the background, and make it wait
# on the injection point so that the checkpoint stops right before
# removing old WAL segments.
note('starting checkpoint');
my $checkpoint = $node->background_psql('postgres');
$checkpoint->query_safe(
	q(select injection_points_attach('checkpoint-before-old-wal-removal','wait'))
);
$checkpoint->query_until(
	qr/starting_checkpoint/,
	q(\echo starting_checkpoint
checkpoint;
\q
));

# Wait until the checkpoint stops right before removing WAL segments.
note('waiting for injection_point');
$node->wait_for_event('checkpointer', 'checkpoint-before-old-wal-removal');
note('injection_point is reached');

# Try to advance the logical slot, but make it stop when it moves to the next
# WAL segment (this has to happen in the background, too).
my $logical = $node->background_psql('postgres');
$logical->query_safe(
	q{select injection_points_attach('logical-replication-slot-advance-segment','wait');}
);
$logical->query_until(
	qr/get_changes/,
	q(
\echo get_changes
select count(*) from pg_logical_slot_get_changes('slot_logical', null, null) \watch 1
\q
));

# Wait until the slot's restart_lsn points to the next WAL segment.
note('waiting for injection_point');
$node->wait_for_event('client backend',
	'logical-replication-slot-advance-segment');
note('injection_point is reached');

# OK, we're in the right situation: time to advance the physical slot, which
# recalculates the required LSN, and then unblock the checkpoint, which
# removes the WAL still needed by the logical slot.
$node->safe_psql('postgres',
	q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
);

# Continue the checkpoint and wait for it to complete.  Remember the log
# position first, so that only a "checkpoint complete" message written after
# the wakeup is matched.  Without this wait the immediate stop below could
# race with the checkpointer and happen before old WAL is removed.
my $log_offset = -s $node->logfile;
$node->safe_psql('postgres',
	q{select injection_points_wakeup('checkpoint-before-old-wal-removal')});
$node->wait_for_log(qr/checkpoint complete/, $log_offset);

# Abruptly stop the server, then bring it back up.
$node->stop('immediate');
$node->start;

# Reading from the logical slot must still work after the restart; an error
# raised here is captured in $@ and reported as a test failure.
eval {
	$node->safe_psql('postgres',
		q{select count(*) from pg_logical_slot_get_changes('slot_logical', null, null);}
	);
};
is($@, '', "Logical slot still valid");

done_testing();

View File

@ -0,0 +1,133 @@
# Copyright (c) 2025, PostgreSQL Global Development Group
#
# This test verifies the case when the physical slot is advanced during
# checkpoint. The test checks that the physical slot's restart_lsn still refers
# to an existing WAL segment after immediate restart.
#
use strict;
use warnings FATAL => 'all';
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;
# Injection points are required.  Treat an unset variable like "no": with
# fatal warnings enabled, comparing undef would abort the script instead of
# skipping it.
if (($ENV{enable_injection_points} // '') ne 'yes')
{
	plan skip_all => 'Injection points not supported by this build';
}

my $node = PostgreSQL::Test::Cluster->new('mike');
$node->init;
$node->append_conf('postgresql.conf',
	"shared_preload_libraries = 'injection_points'");
$node->append_conf('postgresql.conf', "wal_level = 'replica'");
$node->start;
$node->safe_psql('postgres', q(CREATE EXTENSION injection_points));

# Create a simple table to generate data into.
$node->safe_psql('postgres',
	q{create table t (id serial primary key, b text)});

# Create a physical replication slot.
$node->safe_psql('postgres',
	q{select pg_create_physical_replication_slot('slot_physical', true)});

# Advance slot to the current position, just to have everything "valid".
$node->safe_psql('postgres',
	q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
);

# Run checkpoint to flush current state to disk and set a baseline.
$node->safe_psql('postgres', q{checkpoint});

# Insert 100k rows to generate some WAL.
$node->safe_psql('postgres',
	q{insert into t (b) select md5(i::text) from generate_series(1,100000) s(i)}
);

# Advance slot to the current position, just to have everything "valid".
$node->safe_psql('postgres',
	q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
);

# Run another checkpoint to establish a new checkpoint position.
$node->safe_psql('postgres', q{checkpoint});

# Another 1M rows, to generate a substantial amount of WAL (presumably
# enough to span several WAL segments).
$node->safe_psql('postgres',
	q{insert into t (b) select md5(i::text) from generate_series(1,1000000) s(i)}
);

my $restart_lsn_init = $node->safe_psql('postgres',
	q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'}
);
chomp($restart_lsn_init);
note("restart lsn before checkpoint: $restart_lsn_init");

# Run another checkpoint, this time in the background, and make it wait
# on the injection point so that the checkpoint stops right before
# removing old WAL segments.
note('starting checkpoint');
my $checkpoint = $node->background_psql('postgres');
$checkpoint->query_safe(
	q{select injection_points_attach('checkpoint-before-old-wal-removal','wait')}
);
$checkpoint->query_until(
	qr/starting_checkpoint/,
	q(\echo starting_checkpoint
checkpoint;
\q
));

# Wait until the checkpoint stops right before removing WAL segments.
note('waiting for injection_point');
$node->wait_for_event('checkpointer', 'checkpoint-before-old-wal-removal');
note('injection_point is reached');

# OK, we're in the right situation: time to advance the physical slot, which
# recalculates the required LSN, and then unblock the checkpoint, which
# removes the WAL still needed by the physical slot.
$node->safe_psql('postgres',
	q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
);

# Continue the checkpoint and wait for it to complete.  Remember the log
# position first, so that only a "checkpoint complete" message written after
# the wakeup is matched.  Without this wait the immediate stop below could
# race with the checkpointer and happen before old WAL is removed.
my $log_offset = -s $node->logfile;
$node->safe_psql('postgres',
	q{select injection_points_wakeup('checkpoint-before-old-wal-removal')});
$node->wait_for_log(qr/checkpoint complete/, $log_offset);

my $restart_lsn_old = $node->safe_psql('postgres',
	q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'}
);
chomp($restart_lsn_old);
note("restart lsn before stop: $restart_lsn_old");

# Abruptly stop the server, then bring it back up.
$node->stop('immediate');
$node->start;

# Get the restart_lsn of the slot right after restarting.
my $restart_lsn = $node->safe_psql('postgres',
	q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'}
);
chomp($restart_lsn);
note("restart lsn: $restart_lsn");

# Get the WAL segment name for the slot's restart_lsn.
my $restart_lsn_segment = $node->safe_psql('postgres',
	"SELECT pg_walfile_name('$restart_lsn'::pg_lsn)");
chomp($restart_lsn_segment);

# Check if the required WAL segment exists.
note("required by slot segment name: $restart_lsn_segment");
my $datadir = $node->data_dir;
ok( -f "$datadir/pg_wal/$restart_lsn_segment",
	"WAL segment $restart_lsn_segment for physical slot's restart_lsn $restart_lsn exists"
);

done_testing();