mirror of
https://github.com/postgres/postgres.git
synced 2025-07-30 11:03:19 +03:00
Physical replication always ships WAL segment files to replicas once
they are complete. This is a problem if one WAL record is split across
a segment boundary and the primary server crashes before writing down
the segment with the next portion of the WAL record: WAL writing after
crash recovery would happily resume at the point where the broken record
started, overwriting that record ... but any standby or backup may have
already received a copy of that segment, and they are not rewinding.
This causes standbys to stop following the primary after the latter
crashes:
LOG: invalid contrecord length 7262 at A8/D9FFFBC8
because the standby is still trying to read the continuation record
(contrecord) for the original long WAL record, but it is not there and
it will never be. A workaround is to stop the replica, delete the WAL
file, and restart it -- at which point a fresh copy is brought over from
the primary. But that's pretty labor intensive, and I bet many users
would just give up and re-clone the standby instead.
A fix for this problem was already attempted in commit 515e3d84a0, but
it only addressed the case for the scenario of WAL archiving, so
streaming replication would still be a problem (as well as other things
such as taking a filesystem-level backup while the server is down after
having crashed), and it had performance scalability problems too; so it
had to be reverted.
This commit fixes the problem using an approach suggested by Andres
Freund, whereby the initial portion(s) of the split-up WAL record are
kept, and a special type of WAL record is written where the contrecord
was lost, so that WAL replay in the replica knows to skip the broken
parts. With this approach, we can continue to stream/archive segment
files as soon as they are complete, and replay of the broken records
will proceed across the crash point without a hitch.
Because a new type of WAL record is added, users should be careful to
upgrade standbys first, primaries later. Otherwise they risk the standby
being unable to start if the primary happens to write such a record.
A new TAP test that exercises this is added, but the portability of it
is yet to be seen.
This has been wrong since the introduction of physical replication, so
backpatch all the way back. In stable branches, keep the new
XLogReaderState members at the end of the struct, to avoid an ABI
break.
Author: Álvaro Herrera <alvherre@alvh.no-ip.org>
Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Reviewed-by: Nathan Bossart <bossartn@amazon.com>
Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
255 lines
9.2 KiB
C
255 lines
9.2 KiB
C
/*-------------------------------------------------------------------------
 *
 * pg_control.h
 *	  The system control file "pg_control" is not a heap relation.
 *	  However, we define it here so that the format is documented.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/catalog/pg_control.h
 *
 *-------------------------------------------------------------------------
 */
|
|
#ifndef PG_CONTROL_H
|
|
#define PG_CONTROL_H
|
|
|
|
#include "access/transam.h"
|
|
#include "access/xlogdefs.h"
|
|
#include "pgtime.h" /* for pg_time_t */
|
|
#include "port/pg_crc32c.h"
|
|
|
|
|
|
/* Version identifier for this pg_control format */
#define PG_CONTROL_VERSION	1201

/* Nonce key length, see below */
#define MOCK_AUTH_NONCE_LEN		32
|
|
|
|
/*
|
|
* Body of CheckPoint XLOG records. This is declared here because we keep
|
|
* a copy of the latest one in pg_control for possible disaster recovery.
|
|
* Changing this struct requires a PG_CONTROL_VERSION bump.
|
|
*/
|
|
typedef struct CheckPoint
|
|
{
|
|
XLogRecPtr redo; /* next RecPtr available when we began to
|
|
* create CheckPoint (i.e. REDO start point) */
|
|
TimeLineID ThisTimeLineID; /* current TLI */
|
|
TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new
|
|
* timeline (equals ThisTimeLineID otherwise) */
|
|
bool fullPageWrites; /* current full_page_writes */
|
|
FullTransactionId nextFullXid; /* next free full transaction ID */
|
|
Oid nextOid; /* next free OID */
|
|
MultiXactId nextMulti; /* next free MultiXactId */
|
|
MultiXactOffset nextMultiOffset; /* next free MultiXact offset */
|
|
TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */
|
|
Oid oldestXidDB; /* database with minimum datfrozenxid */
|
|
MultiXactId oldestMulti; /* cluster-wide minimum datminmxid */
|
|
Oid oldestMultiDB; /* database with minimum datminmxid */
|
|
pg_time_t time; /* time stamp of checkpoint */
|
|
TransactionId oldestCommitTsXid; /* oldest Xid with valid commit
|
|
* timestamp */
|
|
TransactionId newestCommitTsXid; /* newest Xid with valid commit
|
|
* timestamp */
|
|
|
|
/*
|
|
* Oldest XID still running. This is only needed to initialize hot standby
|
|
* mode from an online checkpoint, so we only bother calculating this for
|
|
* online checkpoints and only when wal_level is replica. Otherwise it's
|
|
* set to InvalidTransactionId.
|
|
*/
|
|
TransactionId oldestActiveXid;
|
|
} CheckPoint;
|
|
|
|
/*
 * XLOG info values for XLOG rmgr.
 *
 * Each value occupies the high nibble of the info byte (steps of 0x10).
 * Values already assigned must not be renumbered.
 */
#define XLOG_CHECKPOINT_SHUTDOWN		0x00
#define XLOG_CHECKPOINT_ONLINE			0x10
#define XLOG_NOOP						0x20
#define XLOG_NEXTOID					0x30
#define XLOG_SWITCH						0x40
#define XLOG_BACKUP_END					0x50
#define XLOG_PARAMETER_CHANGE			0x60
#define XLOG_RESTORE_POINT				0x70
#define XLOG_FPW_CHANGE					0x80
#define XLOG_END_OF_RECOVERY			0x90
#define XLOG_FPI_FOR_HINT				0xA0
#define XLOG_FPI						0xB0
/* 0xC0 is used in Postgres 9.5-11 */
/*
 * Written where the continuation of a WAL record split across a segment
 * boundary was lost, so that WAL replay knows to skip the broken parts.
 */
#define XLOG_OVERWRITE_CONTRECORD		0xD0
|
|
|
|
|
|
/*
 * System status indicator.  Note this is stored in pg_control; if you change
 * it, you must bump PG_CONTROL_VERSION
 *
 * The numeric values are implied by declaration order starting at zero, so
 * inserting or reordering members changes the stored representation.
 */
typedef enum DBState
{
	DB_STARTUP = 0,
	DB_SHUTDOWNED,
	DB_SHUTDOWNED_IN_RECOVERY,
	DB_SHUTDOWNING,
	DB_IN_CRASH_RECOVERY,
	DB_IN_ARCHIVE_RECOVERY,
	DB_IN_PRODUCTION
} DBState;
|
|
|
|
/*
|
|
* Contents of pg_control.
|
|
*/
|
|
|
|
typedef struct ControlFileData
|
|
{
|
|
/*
|
|
* Unique system identifier --- to ensure we match up xlog files with the
|
|
* installation that produced them.
|
|
*/
|
|
uint64 system_identifier;
|
|
|
|
/*
|
|
* Version identifier information. Keep these fields at the same offset,
|
|
* especially pg_control_version; they won't be real useful if they move
|
|
* around. (For historical reasons they must be 8 bytes into the file
|
|
* rather than immediately at the front.)
|
|
*
|
|
* pg_control_version identifies the format of pg_control itself.
|
|
* catalog_version_no identifies the format of the system catalogs.
|
|
*
|
|
* There are additional version identifiers in individual files; for
|
|
* example, WAL logs contain per-page magic numbers that can serve as
|
|
* version cues for the WAL log.
|
|
*/
|
|
uint32 pg_control_version; /* PG_CONTROL_VERSION */
|
|
uint32 catalog_version_no; /* see catversion.h */
|
|
|
|
/*
|
|
* System status data
|
|
*/
|
|
DBState state; /* see enum above */
|
|
pg_time_t time; /* time stamp of last pg_control update */
|
|
XLogRecPtr checkPoint; /* last check point record ptr */
|
|
|
|
CheckPoint checkPointCopy; /* copy of last check point record */
|
|
|
|
XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */
|
|
|
|
/*
|
|
* These two values determine the minimum point we must recover up to
|
|
* before starting up:
|
|
*
|
|
* minRecoveryPoint is updated to the latest replayed LSN whenever we
|
|
* flush a data change during archive recovery. That guards against
|
|
* starting archive recovery, aborting it, and restarting with an earlier
|
|
* stop location. If we've already flushed data changes from WAL record X
|
|
* to disk, we mustn't start up until we reach X again. Zero when not
|
|
* doing archive recovery.
|
|
*
|
|
* backupStartPoint is the redo pointer of the backup start checkpoint, if
|
|
* we are recovering from an online backup and haven't reached the end of
|
|
* backup yet. It is reset to zero when the end of backup is reached, and
|
|
* we mustn't start up before that. A boolean would suffice otherwise, but
|
|
* we use the redo pointer as a cross-check when we see an end-of-backup
|
|
* record, to make sure the end-of-backup record corresponds the base
|
|
* backup we're recovering from.
|
|
*
|
|
* backupEndPoint is the backup end location, if we are recovering from an
|
|
* online backup which was taken from the standby and haven't reached the
|
|
* end of backup yet. It is initialized to the minimum recovery point in
|
|
* pg_control which was backed up last. It is reset to zero when the end
|
|
* of backup is reached, and we mustn't start up before that.
|
|
*
|
|
* If backupEndRequired is true, we know for sure that we're restoring
|
|
* from a backup, and must see a backup-end record before we can safely
|
|
* start up. If it's false, but backupStartPoint is set, a backup_label
|
|
* file was found at startup but it may have been a leftover from a stray
|
|
* pg_start_backup() call, not accompanied by pg_stop_backup().
|
|
*/
|
|
XLogRecPtr minRecoveryPoint;
|
|
TimeLineID minRecoveryPointTLI;
|
|
XLogRecPtr backupStartPoint;
|
|
XLogRecPtr backupEndPoint;
|
|
bool backupEndRequired;
|
|
|
|
/*
|
|
* Parameter settings that determine if the WAL can be used for archival
|
|
* or hot standby.
|
|
*/
|
|
int wal_level;
|
|
bool wal_log_hints;
|
|
int MaxConnections;
|
|
int max_worker_processes;
|
|
int max_wal_senders;
|
|
int max_prepared_xacts;
|
|
int max_locks_per_xact;
|
|
bool track_commit_timestamp;
|
|
|
|
/*
|
|
* This data is used to check for hardware-architecture compatibility of
|
|
* the database and the backend executable. We need not check endianness
|
|
* explicitly, since the pg_control version will surely look wrong to a
|
|
* machine of different endianness, but we do need to worry about MAXALIGN
|
|
* and floating-point format. (Note: storage layout nominally also
|
|
* depends on SHORTALIGN and INTALIGN, but in practice these are the same
|
|
* on all architectures of interest.)
|
|
*
|
|
* Testing just one double value is not a very bulletproof test for
|
|
* floating-point compatibility, but it will catch most cases.
|
|
*/
|
|
uint32 maxAlign; /* alignment requirement for tuples */
|
|
double floatFormat; /* constant 1234567.0 */
|
|
#define FLOATFORMAT_VALUE 1234567.0
|
|
|
|
/*
|
|
* This data is used to make sure that configuration of this database is
|
|
* compatible with the backend executable.
|
|
*/
|
|
uint32 blcksz; /* data block size for this DB */
|
|
uint32 relseg_size; /* blocks per segment of large relation */
|
|
|
|
uint32 xlog_blcksz; /* block size within WAL files */
|
|
uint32 xlog_seg_size; /* size of each WAL segment */
|
|
|
|
uint32 nameDataLen; /* catalog name field width */
|
|
uint32 indexMaxKeys; /* max number of columns in an index */
|
|
|
|
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
|
|
uint32 loblksize; /* chunk size in pg_largeobject */
|
|
|
|
/* flags indicating pass-by-value status of various types */
|
|
bool float4ByVal; /* float4 pass-by-value? */
|
|
bool float8ByVal; /* float8, int8, etc pass-by-value? */
|
|
|
|
/* Are data pages protected by checksums? Zero if no checksum version */
|
|
uint32 data_checksum_version;
|
|
|
|
/*
|
|
* Random nonce, used in authentication requests that need to proceed
|
|
* based on values that are cluster-unique, like a SASL exchange that
|
|
* failed at an early stage.
|
|
*/
|
|
char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN];
|
|
|
|
/* CRC of all above ... MUST BE LAST! */
|
|
pg_crc32c crc;
|
|
} ControlFileData;
|
|
|
|
/*
 * Maximum safe value of sizeof(ControlFileData).  For reliability's sake,
 * it's critical that pg_control updates be atomic writes.  That generally
 * means the active data can't be more than one disk sector, which is 512
 * bytes on common hardware.  Be very careful about raising this limit.
 */
#define PG_CONTROL_MAX_SAFE_SIZE	512

/*
 * Physical size of the pg_control file.  Note that this is considerably
 * bigger than the actually used size (ie, sizeof(ControlFileData)).
 * The idea is to keep the physical size constant independent of format
 * changes, so that ReadControlFile will deliver a suitable wrong-version
 * message instead of a read error if it's looking at an incompatible file.
 */
#define PG_CONTROL_FILE_SIZE		8192
|
|
|
|
#endif /* PG_CONTROL_H */
|