1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-23 14:01:44 +03:00

Revamp the WAL record format.

Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.

There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.

This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.

For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.

The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.

Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
This commit is contained in:
Heikki Linnakangas
2014-11-20 17:56:26 +02:00
parent 8dc626defe
commit 2c03216d83
93 changed files with 3945 additions and 4366 deletions

View File

@ -17,6 +17,7 @@
#include "access/xlogreader.h"
#include "access/xlogrecord.h"
#include "access/xlog_internal.h"
#include "access/transam.h"
#include "common/fe_memutils.h"
#include "getopt_long.h"
@ -343,90 +344,117 @@ XLogDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen,
* Store per-rmgr and per-record statistics for a given record.
*/
static void
XLogDumpCountRecord(XLogDumpConfig *config, XLogDumpStats *stats, XLogRecPtr ReadRecPtr, XLogRecord *record)
XLogDumpCountRecord(XLogDumpConfig *config, XLogDumpStats *stats,
XLogReaderState *record)
{
RmgrId rmid;
uint8 recid;
uint32 rec_len;
uint32 fpi_len;
stats->count++;
/* Update per-rmgr statistics */
rmid = record->xl_rmid;
rmid = XLogRecGetRmid(record);
rec_len = XLogRecGetDataLen(record) + SizeOfXLogRecord;
fpi_len = record->decoded_record->xl_tot_len - rec_len;
stats->rmgr_stats[rmid].count++;
stats->rmgr_stats[rmid].rec_len +=
record->xl_len + SizeOfXLogRecord;
stats->rmgr_stats[rmid].fpi_len +=
record->xl_tot_len - (record->xl_len + SizeOfXLogRecord);
stats->rmgr_stats[rmid].rec_len += rec_len;
stats->rmgr_stats[rmid].fpi_len += fpi_len;
/*
* Update per-record statistics, where the record is identified by a
* combination of the RmgrId and the four bits of the xl_info field
* that are the rmgr's domain (resulting in sixteen possible entries
* per RmgrId).
* combination of the RmgrId and the four bits of the xl_info field that
* are the rmgr's domain (resulting in sixteen possible entries per
* RmgrId).
*/
recid = record->xl_info >> 4;
recid = XLogRecGetInfo(record) >> 4;
stats->record_stats[rmid][recid].count++;
stats->record_stats[rmid][recid].rec_len +=
record->xl_len + SizeOfXLogRecord;
stats->record_stats[rmid][recid].fpi_len +=
record->xl_tot_len - (record->xl_len + SizeOfXLogRecord);
stats->record_stats[rmid][recid].rec_len += rec_len;
stats->record_stats[rmid][recid].fpi_len += fpi_len;
}
/*
* Print a record to stdout
*/
static void
XLogDumpDisplayRecord(XLogDumpConfig *config, XLogRecPtr ReadRecPtr, XLogRecord *record)
XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record)
{
const char *id;
const RmgrDescData *desc = &RmgrDescTable[record->xl_rmid];
const char *id;
const RmgrDescData *desc = &RmgrDescTable[XLogRecGetRmid(record)];
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blk;
int block_id;
uint8 info = XLogRecGetInfo(record);
XLogRecPtr xl_prev = XLogRecGetPrev(record);
id = desc->rm_identify(record->xl_info);
id = desc->rm_identify(info);
if (id == NULL)
id = psprintf("UNKNOWN (%x)", record->xl_info & ~XLR_INFO_MASK);
id = psprintf("UNKNOWN (%x)", info & ~XLR_INFO_MASK);
printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, bkp: %u%u%u%u, desc: %s ",
printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ",
desc->rm_name,
record->xl_len, record->xl_tot_len,
record->xl_xid,
(uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
(uint32) (record->xl_prev >> 32), (uint32) record->xl_prev,
!!(XLR_BKP_BLOCK(0) & record->xl_info),
!!(XLR_BKP_BLOCK(1) & record->xl_info),
!!(XLR_BKP_BLOCK(2) & record->xl_info),
!!(XLR_BKP_BLOCK(3) & record->xl_info),
id);
XLogRecGetDataLen(record), XLogRecGetTotalLen(record),
XLogRecGetXid(record),
(uint32) (record->ReadRecPtr >> 32), (uint32) record->ReadRecPtr,
(uint32) (xl_prev >> 32), (uint32) xl_prev);
printf("desc: %s ", id);
/* the desc routine will printf the description directly to stdout */
desc->rm_desc(NULL, record);
putchar('\n');
if (config->bkp_details)
if (!config->bkp_details)
{
int bkpnum;
char *blk = (char *) XLogRecGetData(record) + record->xl_len;
for (bkpnum = 0; bkpnum < XLR_MAX_BKP_BLOCKS; bkpnum++)
/* print block references (short format) */
for (block_id = 0; block_id <= record->max_block_id; block_id++)
{
BkpBlock bkpb;
if (!(XLR_BKP_BLOCK(bkpnum) & record->xl_info))
if (!XLogRecHasBlockRef(record, block_id))
continue;
memcpy(&bkpb, blk, sizeof(BkpBlock));
blk += sizeof(BkpBlock);
blk += BLCKSZ - bkpb.hole_length;
XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
if (forknum != MAIN_FORKNUM)
printf(", blkref #%u: rel %u/%u/%u fork %s blk %u",
block_id,
rnode.spcNode, rnode.dbNode, rnode.relNode,
forkNames[forknum],
blk);
else
printf(", blkref #%u: rel %u/%u/%u blk %u",
block_id,
rnode.spcNode, rnode.dbNode, rnode.relNode,
blk);
if (XLogRecHasBlockImage(record, block_id))
printf(" FPW");
}
putchar('\n');
}
else
{
/* print block references (detailed format) */
putchar('\n');
for (block_id = 0; block_id <= record->max_block_id; block_id++)
{
if (!XLogRecHasBlockRef(record, block_id))
continue;
printf("\tbackup bkp #%u; rel %u/%u/%u; fork: %s; block: %u; hole: offset: %u, length: %u\n",
bkpnum,
bkpb.node.spcNode, bkpb.node.dbNode, bkpb.node.relNode,
forkNames[bkpb.fork],
bkpb.block, bkpb.hole_offset, bkpb.hole_length);
XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
printf("\tblkref #%u: rel %u/%u/%u fork %s blk %u",
block_id,
rnode.spcNode, rnode.dbNode, rnode.relNode,
forkNames[forknum],
blk);
if (XLogRecHasBlockImage(record, block_id))
{
printf(" (FPW); hole: offset: %u, length: %u\n",
record->blocks[block_id].hole_offset,
record->blocks[block_id].hole_length);
}
putchar('\n');
}
}
}
@ -924,9 +952,9 @@ main(int argc, char **argv)
/* process the record */
if (config.stats == true)
XLogDumpCountRecord(&config, &stats, xlogreader_state->ReadRecPtr, record);
XLogDumpCountRecord(&config, &stats, xlogreader_state);
else
XLogDumpDisplayRecord(&config, xlogreader_state->ReadRecPtr, record);
XLogDumpDisplayRecord(&config, xlogreader_state);
/* check whether we printed enough */
config.already_displayed_records++;

View File

@ -13,7 +13,7 @@
typedef struct RmgrDescData
{
const char *rm_name;
void (*rm_desc) (StringInfo buf, XLogRecord *record);
void (*rm_desc) (StringInfo buf, XLogReaderState *record);
const char *(*rm_identify) (uint8 info);
} RmgrDescData;