1
0
mirror of https://github.com/postgres/postgres.git synced 2025-05-08 07:21:33 +03:00

Do not summarize WAL if generated with wal_level=minimal.

To do this, we must include the wal_level in the first WAL record
covered by each summary file; so add wal_level to struct Checkpoint
and the payload of XLOG_CHECKPOINT_REDO and XLOG_END_OF_RECOVERY.

This, in turn, requires bumping XLOG_PAGE_MAGIC and, since the
Checkpoint is also stored in the control file, also
PG_CONTROL_VERSION. It's not great to do that so late in the release
cycle, but the alternative seems to be to ship v17 without robust
protections against this scenario, which could result in corrupted
incremental backups.

A side effect of this patch is that, when a server with
wal_level=replica is started with summarize_wal=on for the first time,
summarization will no longer begin with the oldest WAL that still
exists in pg_wal, but rather from the first checkpoint after that.
This change should be harmless, because a WAL summary for a partial
checkpoint cycle can never make an incremental backup possible when
it would otherwise not have been.

Report by Fujii Masao. Patch by me. Review and/or testing by Jakub
Wartak and Fujii Masao.

Discussion: http://postgr.es/m/6e30082e-041b-4e31-9633-95a66de76f5d@oss.nttdata.com
This commit is contained in:
Robert Haas 2024-07-18 12:09:48 -04:00
parent 4cbd8c6022
commit 2b5819e2b4
9 changed files with 278 additions and 81 deletions

View File

@ -4318,11 +4318,17 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows
<listitem> <listitem>
<para> <para>
Enables the WAL summarizer process. Note that WAL summarization can Enables the WAL summarizer process. Note that WAL summarization can
be enabled either on a primary or on a standby. WAL summarization be enabled either on a primary or on a standby. This parameter can only
cannot be enabled when <varname>wal_level</varname> is set to be set in the <filename>postgresql.conf</filename> file or on the server
<literal>minimal</literal>. This parameter can only be set in the command line. The default is <literal>off</literal>.
<filename>postgresql.conf</filename> file or on the server command line. </para>
The default is <literal>off</literal>. <para>
The server cannot be started with <literal>summarize_wal=on</literal>
if <literal>wal_level</literal> is set to <literal>minimal</literal>. If
<literal>summarize_wal=on</literal> is configured after server startup
while <literal>wal_level=minimal</literal>, the summarizer will run
but refuse to generate summary files for any WAL generated with
<literal>wal_level=minimal</literal>.
</para> </para>
</listitem> </listitem>
</varlistentry> </varlistentry>

View File

@ -27960,6 +27960,17 @@ SELECT currval(pg_get_serial_sequence('sometable', 'id'));
not running, it will be equal to <literal>summarized_lsn</literal>. not running, it will be equal to <literal>summarized_lsn</literal>.
<literal>summarizer_pid</literal> is the PID of the WAL summarizer <literal>summarizer_pid</literal> is the PID of the WAL summarizer
process, if it is running, and otherwise NULL. process, if it is running, and otherwise NULL.
</para>
<para>
As a special exception, the WAL summarizer will refuse to generate
WAL summary files if run on WAL generated under
<literal>wal_level=minimal</literal>, since such summaries would be
unsafe to use as the basis for an incremental backup. In this case,
the fields above will continue to advance as if summaries were being
generated, but nothing will be written to disk. Once the summarizer
reaches WAL generated while <literal>wal_level</literal> was set
to <literal>replica</literal> or higher, it will resume writing
summaries to disk.
</para></entry> </para></entry>
</row> </row>
</tbody> </tbody>

View File

@ -33,6 +33,27 @@ const struct config_enum_entry wal_level_options[] = {
{NULL, 0, false} {NULL, 0, false}
}; };
/*
* Find a string representation for wal_level
*/
static const char *
get_wal_level_string(int wal_level)
{
const struct config_enum_entry *entry;
const char *wal_level_str = "?";
for (entry = wal_level_options; entry->name; entry++)
{
if (entry->val == wal_level)
{
wal_level_str = entry->name;
break;
}
}
return wal_level_str;
}
void void
xlog_desc(StringInfo buf, XLogReaderState *record) xlog_desc(StringInfo buf, XLogReaderState *record)
{ {
@ -45,7 +66,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
CheckPoint *checkpoint = (CheckPoint *) rec; CheckPoint *checkpoint = (CheckPoint *) rec;
appendStringInfo(buf, "redo %X/%X; " appendStringInfo(buf, "redo %X/%X; "
"tli %u; prev tli %u; fpw %s; xid %u:%u; oid %u; multi %u; offset %u; " "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; "
"oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest xid %u in DB %u; oldest multi %u in DB %u; "
"oldest/newest commit timestamp xid: %u/%u; " "oldest/newest commit timestamp xid: %u/%u; "
"oldest running xid %u; %s", "oldest running xid %u; %s",
@ -53,6 +74,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
checkpoint->ThisTimeLineID, checkpoint->ThisTimeLineID,
checkpoint->PrevTimeLineID, checkpoint->PrevTimeLineID,
checkpoint->fullPageWrites ? "true" : "false", checkpoint->fullPageWrites ? "true" : "false",
get_wal_level_string(checkpoint->wal_level),
EpochFromFullTransactionId(checkpoint->nextXid), EpochFromFullTransactionId(checkpoint->nextXid),
XidFromFullTransactionId(checkpoint->nextXid), XidFromFullTransactionId(checkpoint->nextXid),
checkpoint->nextOid, checkpoint->nextOid,
@ -95,20 +117,9 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
{ {
xl_parameter_change xlrec; xl_parameter_change xlrec;
const char *wal_level_str; const char *wal_level_str;
const struct config_enum_entry *entry;
memcpy(&xlrec, rec, sizeof(xl_parameter_change)); memcpy(&xlrec, rec, sizeof(xl_parameter_change));
wal_level_str = get_wal_level_string(xlrec.wal_level);
/* Find a string representation for wal_level */
wal_level_str = "?";
for (entry = wal_level_options; entry->name; entry++)
{
if (entry->val == xlrec.wal_level)
{
wal_level_str = entry->name;
break;
}
}
appendStringInfo(buf, "max_connections=%d max_worker_processes=%d " appendStringInfo(buf, "max_connections=%d max_worker_processes=%d "
"max_wal_senders=%d max_prepared_xacts=%d " "max_wal_senders=%d max_prepared_xacts=%d "
@ -135,9 +146,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
xl_end_of_recovery xlrec; xl_end_of_recovery xlrec;
memcpy(&xlrec, rec, sizeof(xl_end_of_recovery)); memcpy(&xlrec, rec, sizeof(xl_end_of_recovery));
appendStringInfo(buf, "tli %u; prev tli %u; time %s", appendStringInfo(buf, "tli %u; prev tli %u; time %s; wal_level %s",
xlrec.ThisTimeLineID, xlrec.PrevTimeLineID, xlrec.ThisTimeLineID, xlrec.PrevTimeLineID,
timestamptz_to_str(xlrec.end_time)); timestamptz_to_str(xlrec.end_time),
get_wal_level_string(xlrec.wal_level));
} }
else if (info == XLOG_OVERWRITE_CONTRECORD) else if (info == XLOG_OVERWRITE_CONTRECORD)
{ {
@ -150,7 +162,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
} }
else if (info == XLOG_CHECKPOINT_REDO) else if (info == XLOG_CHECKPOINT_REDO)
{ {
/* No details to write out */ int wal_level;
memcpy(&wal_level, rec, sizeof(int));
appendStringInfo(buf, "wal_level %s", get_wal_level_string(wal_level));
} }
} }

View File

@ -6934,6 +6934,7 @@ CreateCheckPoint(int flags)
WALInsertLockAcquireExclusive(); WALInsertLockAcquireExclusive();
checkPoint.fullPageWrites = Insert->fullPageWrites; checkPoint.fullPageWrites = Insert->fullPageWrites;
checkPoint.wal_level = wal_level;
if (shutdown) if (shutdown)
{ {
@ -6987,11 +6988,9 @@ CreateCheckPoint(int flags)
*/ */
if (!shutdown) if (!shutdown)
{ {
int dummy = 0; /* Include WAL level in record for WAL summarizer's benefit. */
/* Record must have payload to avoid assertion failure. */
XLogBeginInsert(); XLogBeginInsert();
XLogRegisterData((char *) &dummy, sizeof(dummy)); XLogRegisterData((char *) &wal_level, sizeof(wal_level));
(void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO); (void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
/* /*
@ -7314,6 +7313,7 @@ CreateEndOfRecoveryRecord(void)
elog(ERROR, "can only be used to end recovery"); elog(ERROR, "can only be used to end recovery");
xlrec.end_time = GetCurrentTimestamp(); xlrec.end_time = GetCurrentTimestamp();
xlrec.wal_level = wal_level;
WALInsertLockAcquireExclusive(); WALInsertLockAcquireExclusive();
xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID; xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID;

View File

@ -154,7 +154,8 @@ static void SummarizeSmgrRecord(XLogReaderState *xlogreader,
BlockRefTable *brtab); BlockRefTable *brtab);
static void SummarizeXactRecord(XLogReaderState *xlogreader, static void SummarizeXactRecord(XLogReaderState *xlogreader,
BlockRefTable *brtab); BlockRefTable *brtab);
static bool SummarizeXlogRecord(XLogReaderState *xlogreader); static bool SummarizeXlogRecord(XLogReaderState *xlogreader,
bool *new_fast_forward);
static int summarizer_read_local_xlog_page(XLogReaderState *state, static int summarizer_read_local_xlog_page(XLogReaderState *state,
XLogRecPtr targetPagePtr, XLogRecPtr targetPagePtr,
int reqLen, int reqLen,
@ -802,6 +803,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
char final_path[MAXPGPATH]; char final_path[MAXPGPATH];
WalSummaryIO io; WalSummaryIO io;
BlockRefTable *brtab = CreateEmptyBlockRefTable(); BlockRefTable *brtab = CreateEmptyBlockRefTable();
bool fast_forward = true;
/* Initialize private data for xlogreader. */ /* Initialize private data for xlogreader. */
private_data = (SummarizerReadLocalXLogPrivate *) private_data = (SummarizerReadLocalXLogPrivate *)
@ -900,7 +902,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
int block_id; int block_id;
char *errormsg; char *errormsg;
XLogRecord *record; XLogRecord *record;
bool stop_requested = false; uint8 rmid;
HandleWalSummarizerInterrupts(); HandleWalSummarizerInterrupts();
@ -969,56 +971,86 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
break; break;
} }
/* Special handling for particular types of WAL records. */ /*
switch (XLogRecGetRmid(xlogreader)) * Certain types of records require special handling. Redo points and
* shutdown checkpoints trigger creation of new summary files and can
* also cause us to enter or exit "fast forward" mode. Other types of
* records can require special updates to the block reference table.
*/
rmid = XLogRecGetRmid(xlogreader);
if (rmid == RM_XLOG_ID)
{ {
case RM_DBASE_ID: bool new_fast_forward;
SummarizeDbaseRecord(xlogreader, brtab);
break; /*
case RM_SMGR_ID: * If we've already processed some WAL records when we hit a redo
SummarizeSmgrRecord(xlogreader, brtab); * point or shutdown checkpoint, then we stop summarization before
break; * including this record in the current file, so that it will be
case RM_XACT_ID: * the first record in the next file.
SummarizeXactRecord(xlogreader, brtab); *
break; * When we hit one of those record types as the first record in a
case RM_XLOG_ID: * file, we adjust our notion of whether we're fast-forwarding.
stop_requested = SummarizeXlogRecord(xlogreader); * Any WAL generated with wal_level=minimal must be skipped
break; * without actually generating any summary file, because an
default: * incremental backup that crosses such WAL would be unsafe.
break; */
if (SummarizeXlogRecord(xlogreader, &new_fast_forward))
{
if (xlogreader->ReadRecPtr > summary_start_lsn)
{
summary_end_lsn = xlogreader->ReadRecPtr;
break;
}
else
fast_forward = new_fast_forward;
}
}
else if (!fast_forward)
{
/*
* This switch handles record types that require extra updates to
* the contents of the block reference table.
*/
switch (rmid)
{
case RM_DBASE_ID:
SummarizeDbaseRecord(xlogreader, brtab);
break;
case RM_SMGR_ID:
SummarizeSmgrRecord(xlogreader, brtab);
break;
case RM_XACT_ID:
SummarizeXactRecord(xlogreader, brtab);
break;
}
} }
/* /*
* If we've been told that it's time to end this WAL summary file, do * If we're in fast-forward mode, we don't really need to do anything.
* so. As an exception, if there's nothing included in this WAL * Otherwise, feed block references from xlog record to block
* summary file yet, then stopping doesn't make any sense, and we * reference table.
* should wait until the next stop point instead.
*/ */
if (stop_requested && xlogreader->ReadRecPtr > summary_start_lsn) if (!fast_forward)
{ {
summary_end_lsn = xlogreader->ReadRecPtr; for (block_id = 0; block_id <= XLogRecMaxBlockId(xlogreader);
break; block_id++)
} {
RelFileLocator rlocator;
ForkNumber forknum;
BlockNumber blocknum;
/* Feed block references from xlog record to block reference table. */ if (!XLogRecGetBlockTagExtended(xlogreader, block_id, &rlocator,
for (block_id = 0; block_id <= XLogRecMaxBlockId(xlogreader); &forknum, &blocknum, NULL))
block_id++) continue;
{
RelFileLocator rlocator;
ForkNumber forknum;
BlockNumber blocknum;
if (!XLogRecGetBlockTagExtended(xlogreader, block_id, &rlocator, /*
&forknum, &blocknum, NULL)) * As we do elsewhere, ignore the FSM fork, because it's not
continue; * fully WAL-logged.
*/
/* if (forknum != FSM_FORKNUM)
* As we do elsewhere, ignore the FSM fork, because it's not fully BlockRefTableMarkBlockModified(brtab, &rlocator, forknum,
* WAL-logged. blocknum);
*/ }
if (forknum != FSM_FORKNUM)
BlockRefTableMarkBlockModified(brtab, &rlocator, forknum,
blocknum);
} }
/* Update our notion of where this summary file ends. */ /* Update our notion of where this summary file ends. */
@ -1047,9 +1079,10 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
/* /*
* If a timeline switch occurs, we may fail to make any progress at all * If a timeline switch occurs, we may fail to make any progress at all
* before exiting the loop above. If that happens, we don't write a WAL * before exiting the loop above. If that happens, we don't write a WAL
* summary file at all. * summary file at all. We can also skip writing a file if we're in
* fast-forward mode.
*/ */
if (summary_end_lsn > summary_start_lsn) if (summary_end_lsn > summary_start_lsn && !fast_forward)
{ {
/* Generate temporary and final path name. */ /* Generate temporary and final path name. */
snprintf(temp_path, MAXPGPATH, snprintf(temp_path, MAXPGPATH,
@ -1085,6 +1118,14 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
durable_rename(temp_path, final_path, ERROR); durable_rename(temp_path, final_path, ERROR);
} }
/* If we skipped a non-zero amount of WAL, log a debug message. */
if (summary_end_lsn > summary_start_lsn && fast_forward)
ereport(DEBUG1,
errmsg("skipped summarizing WAL on TLI %u from %X/%X to %X/%X",
tli,
LSN_FORMAT_ARGS(summary_start_lsn),
LSN_FORMAT_ARGS(summary_end_lsn)));
return summary_end_lsn; return summary_end_lsn;
} }
@ -1263,22 +1304,70 @@ SummarizeXactRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
/* /*
* Special handling for WAL records with RM_XLOG_ID. * Special handling for WAL records with RM_XLOG_ID.
*
* The return value is true if WAL summarization should stop before this
* record and false otherwise. When the return value is true,
* *new_fast_forward indicates whether future processing should be done
* in fast forward mode (i.e. read WAL without emitting summaries) or not.
*/ */
static bool static bool
SummarizeXlogRecord(XLogReaderState *xlogreader) SummarizeXlogRecord(XLogReaderState *xlogreader, bool *new_fast_forward)
{ {
uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK; uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
int record_wal_level;
if (info == XLOG_CHECKPOINT_REDO || info == XLOG_CHECKPOINT_SHUTDOWN) if (info == XLOG_CHECKPOINT_REDO)
{ {
/* /* Payload is wal_level at the time record was written. */
* This is an LSN at which redo might begin, so we'd like memcpy(&record_wal_level, XLogRecGetData(xlogreader), sizeof(int));
* summarization to stop just before this WAL record. }
*/ else if (info == XLOG_CHECKPOINT_SHUTDOWN)
return true; {
CheckPoint rec_ckpt;
/* Extract wal_level at time record was written from payload. */
memcpy(&rec_ckpt, XLogRecGetData(xlogreader), sizeof(CheckPoint));
record_wal_level = rec_ckpt.wal_level;
}
else if (info == XLOG_PARAMETER_CHANGE)
{
xl_parameter_change xlrec;
/* Extract wal_level at time record was written from payload. */
memcpy(&xlrec, XLogRecGetData(xlogreader),
sizeof(xl_parameter_change));
record_wal_level = xlrec.wal_level;
}
else if (info == XLOG_END_OF_RECOVERY)
{
xl_end_of_recovery xlrec;
/* Extract wal_level at time record was written from payload. */
memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
record_wal_level = xlrec.wal_level;
}
else
{
/* No special handling required. Return false. */
return false;
} }
return false; /*
* Redo can only begin at an XLOG_CHECKPOINT_REDO or
* XLOG_CHECKPOINT_SHUTDOWN record, so we want WAL summarization to begin
* at those points. Hence, when those records are encountered, return
* true, so that we stop just before summarizing either of those records.
*
* We also reach here if we just saw XLOG_END_OF_RECOVERY or
* XLOG_PARAMETER_CHANGE. These are not places where recovery can start,
* but they're still relevant here. A new timeline can begin with
* XLOG_END_OF_RECOVERY, so we need to confirm the WAL level at that
* point; and a restart can provoke XLOG_PARAMETER_CHANGE after an
* intervening change to postgresql.conf, which might force us to stop
* summarizing.
*/
*new_fast_forward = (record_wal_level == WAL_LEVEL_MINIMAL);
return true;
} }
/* /*

View File

@ -34,6 +34,7 @@ tests += {
't/004_manifest.pl', 't/004_manifest.pl',
't/005_integrity.pl', 't/005_integrity.pl',
't/006_db_file_copy.pl', 't/006_db_file_copy.pl',
't/007_wal_level_minimal.pl',
], ],
} }
} }

View File

@ -0,0 +1,73 @@
# Copyright (c) 2021-2024, PostgreSQL Global Development Group
#
# This test aims to validate that taking an incremental backup fails when
# wal_level has been changed to minimal between the full backup and the
# attempted incremental backup.
#
# Outline:
#   1. start a node with summarize_wal = on and take a full backup;
#   2. restart with wal_level = minimal and generate some WAL;
#   3. restore the original settings and restart again;
#   4. attempt an incremental backup based on the full backup's manifest
#      and expect pg_basebackup to fail with an "incomplete WAL summaries"
#      error.
use strict;
use warnings FATAL => 'all';
# NOTE(review): nothing in this script appears to call File::Compare's
# functions; confirm whether this import is still needed.
use File::Compare;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;
# Can be changed to test the other modes.
# NOTE(review): $mode is only reported through note() below; no command in
# this script receives it as an argument.
my $mode = $ENV{PG_TEST_PG_COMBINEBACKUP_MODE} || '--copy';
note "testing using mode $mode";
# Set up a new database instance.
# WAL summarization is enabled from the start so that summaries exist for
# the WAL generated before the switch to wal_level=minimal.
my $node1 = PostgreSQL::Test::Cluster->new('node1');
$node1->init(allows_streaming => 1);
$node1->append_conf('postgresql.conf', <<EOM);
summarize_wal = on
wal_keep_size = '1GB'
EOM
$node1->start;
# Create a table and insert a test row into it.
$node1->safe_psql('postgres', <<EOM);
CREATE TABLE mytable (a int, b text);
INSERT INTO mytable VALUES (1, 'finch');
EOM
# Take a full backup.
# Its backup_manifest is used later as the base for the incremental backup.
my $backup1path = $node1->backup_dir . '/backup1';
$node1->command_ok(
[ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ],
"full backup");
# Switch to wal_level=minimal, which also requires max_wal_senders=0 and
# summarize_wal=off
# The restart below is what applies the settings written by ALTER SYSTEM.
$node1->safe_psql('postgres', <<EOM);
ALTER SYSTEM SET wal_level = minimal;
ALTER SYSTEM SET max_wal_senders = 0;
ALTER SYSTEM SET summarize_wal = off;
EOM
$node1->restart;
# Insert a second row on the original node.
# This generates WAL while the server is running with wal_level=minimal.
$node1->safe_psql('postgres', <<EOM);
INSERT INTO mytable VALUES (2, 'gerbil');
EOM
# Revert configuration changes
# RESET removes the ALTER SYSTEM overrides, so after the restart the values
# from postgresql.conf (including summarize_wal = on) apply again.
$node1->safe_psql('postgres', <<EOM);
ALTER SYSTEM RESET wal_level;
ALTER SYSTEM RESET max_wal_senders;
ALTER SYSTEM RESET summarize_wal;
EOM
$node1->restart;
# Now take an incremental backup.
# It must fail: the WAL written under wal_level=minimal is not summarized,
# so the summaries between the full backup and now are incomplete.
my $backup2path = $node1->backup_dir . '/backup2';
$node1->command_fails_like(
[
'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast',
'--incremental', $backup1path . '/backup_manifest'
],
qr/WAL summaries are required on timeline 1 from.*are incomplete/,
"incremental backup fails");
# OK, that's all.
done_testing();

View File

@ -31,7 +31,7 @@
/* /*
* Each page of XLOG file has a header like this: * Each page of XLOG file has a header like this:
*/ */
#define XLOG_PAGE_MAGIC 0xD115 /* can be used as WAL version indicator */ #define XLOG_PAGE_MAGIC 0xD116 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData typedef struct XLogPageHeaderData
{ {
@ -302,6 +302,7 @@ typedef struct xl_end_of_recovery
TimestampTz end_time; TimestampTz end_time;
TimeLineID ThisTimeLineID; /* new TLI */ TimeLineID ThisTimeLineID; /* new TLI */
TimeLineID PrevTimeLineID; /* previous TLI we forked off from */ TimeLineID PrevTimeLineID; /* previous TLI we forked off from */
int wal_level;
} xl_end_of_recovery; } xl_end_of_recovery;
/* /*

View File

@ -22,7 +22,7 @@
/* Version identifier for this pg_control format */ /* Version identifier for this pg_control format */
#define PG_CONTROL_VERSION 1300 #define PG_CONTROL_VERSION 1700
/* Nonce key length, see below */ /* Nonce key length, see below */
#define MOCK_AUTH_NONCE_LEN 32 #define MOCK_AUTH_NONCE_LEN 32
@ -40,6 +40,7 @@ typedef struct CheckPoint
TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new
* timeline (equals ThisTimeLineID otherwise) */ * timeline (equals ThisTimeLineID otherwise) */
bool fullPageWrites; /* current full_page_writes */ bool fullPageWrites; /* current full_page_writes */
int wal_level; /* current wal_level */
FullTransactionId nextXid; /* next free transaction ID */ FullTransactionId nextXid; /* next free transaction ID */
Oid nextOid; /* next free OID */ Oid nextOid; /* next free OID */
MultiXactId nextMulti; /* next free MultiXactId */ MultiXactId nextMulti; /* next free MultiXactId */