mirror of
https://github.com/postgres/postgres.git
synced 2025-04-29 13:56:47 +03:00
Per gripes from Michael Paquier Discussion: https://postgr.es/m/ZhTQ6_w1vwOhqTQI@paquier.xyz Along the way, also clean up a handful of typos in 3311ea86ed and ea7b4e9a2a, found by Alexander Lakhin, and a couple of stylistic snafus noted by Daniel Westermann and Daniel Gustafsson.
1124 lines
37 KiB
C
1124 lines
37 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* basebackup_incremental.c
|
|
* code for incremental backup support
|
|
*
|
|
* This code isn't actually in charge of taking an incremental backup;
|
|
* the actual construction of the incremental backup happens in
|
|
* basebackup.c. Here, we're concerned with providing the necessary
|
|
* support for that operation. In particular, we need to parse the
|
|
* backup manifest supplied by the user taking the incremental backup
|
|
* and extract the required information from it.
|
|
*
|
|
* Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
|
|
*
|
|
* IDENTIFICATION
|
|
* src/backend/backup/basebackup_incremental.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include "access/timeline.h"
|
|
#include "access/xlog.h"
|
|
#include "backup/basebackup_incremental.h"
|
|
#include "backup/walsummary.h"
|
|
#include "common/blkreftable.h"
|
|
#include "common/hashfn.h"
|
|
#include "common/int.h"
|
|
#include "common/parse_manifest.h"
|
|
#include "datatype/timestamp.h"
|
|
#include "postmaster/walsummarizer.h"
|
|
#include "utils/timestamp.h"
|
|
|
|
/* Maximum number of block numbers fetched from a WAL summary reader per call. */
#define BLOCKS_PER_READ 512
|
|
|
|
/*
|
|
* We expect to find the last lines of the manifest, including the checksum,
|
|
* in the last MIN_CHUNK bytes of the manifest. We trigger an incremental
|
|
* parse step if we are about to overflow MAX_CHUNK bytes.
|
|
*/
|
|
/* Bytes always kept back in the manifest buffer for the final parse step. */
#define MIN_CHUNK 1024
|
|
/* Buffered manifest size beyond which an incremental parse step is run. */
#define MAX_CHUNK (128 * 1024)
|
|
|
|
/*
|
|
* Details extracted from the WAL ranges present in the supplied backup manifest.
|
|
*/
|
|
typedef struct
|
|
{
|
|
TimeLineID tli;
|
|
XLogRecPtr start_lsn;
|
|
XLogRecPtr end_lsn;
|
|
} backup_wal_range;
|
|
|
|
/*
|
|
* Details extracted from the file list present in the supplied backup manifest.
|
|
*/
|
|
typedef struct
|
|
{
|
|
uint32 status;
|
|
const char *path;
|
|
size_t size;
|
|
} backup_file_entry;
|
|
|
|
/*
 * Instantiate a simplehash table named "backup_file" whose elements are
 * backup_file_entry structs keyed by their path member.  Keys are C strings,
 * hashed with hash_string_pointer() and compared with strcmp().
 */
static uint32 hash_string_pointer(const char *s);

#define SH_PREFIX				backup_file
#define SH_ELEMENT_TYPE			backup_file_entry
#define SH_KEY_TYPE				const char *
#define SH_KEY					path
#define SH_SCOPE				static inline
#define SH_HASH_KEY(tb, key)	hash_string_pointer(key)
#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
#define SH_DECLARE
#define SH_DEFINE
#include "lib/simplehash.h"
|
|
|
|
struct IncrementalBackupInfo
|
|
{
|
|
/* Memory context for this object and its subsidiary objects. */
|
|
MemoryContext mcxt;
|
|
|
|
/* Temporary buffer for storing the manifest while parsing it. */
|
|
StringInfoData buf;
|
|
|
|
/* WAL ranges extracted from the backup manifest. */
|
|
List *manifest_wal_ranges;
|
|
|
|
/*
|
|
* Files extracted from the backup manifest.
|
|
*
|
|
* We don't really need this information, because we use WAL summaries to
|
|
* figure out what's changed. It would be unsafe to just rely on the list
|
|
* of files that existed before, because it's possible for a file to be
|
|
* removed and a new one created with the same name and different
|
|
* contents. In such cases, the whole file must still be sent. We can tell
|
|
* from the WAL summaries whether that happened, but not from the file
|
|
* list.
|
|
*
|
|
* Nonetheless, this data is useful for sanity checking. If a file that we
|
|
* think we shouldn't need to send is not present in the manifest for the
|
|
* prior backup, something has gone terribly wrong. We retain the file
|
|
* names and sizes, but not the checksums or last modified times, for
|
|
* which we have no use.
|
|
*
|
|
* One significant downside of storing this data is that it consumes
|
|
* memory. If that turns out to be a problem, we might have to decide not
|
|
* to retain this information, or to make it optional.
|
|
*/
|
|
backup_file_hash *manifest_files;
|
|
|
|
/*
|
|
* Block-reference table for the incremental backup.
|
|
*
|
|
* It's possible that storing the entire block-reference table in memory
|
|
* will be a problem for some users. The in-memory format that we're using
|
|
* here is pretty efficient, converging to little more than 1 bit per
|
|
* block for relation forks with large numbers of modified blocks. It's
|
|
* possible, however, that if you try to perform an incremental backup of
|
|
* a database with a sufficiently large number of relations on a
|
|
* sufficiently small machine, you could run out of memory here. If that
|
|
* turns out to be a problem in practice, we'll need to be more clever.
|
|
*/
|
|
BlockRefTable *brtab;
|
|
|
|
/*
|
|
* State object for incremental JSON parsing
|
|
*/
|
|
JsonManifestParseIncrementalState *inc_state;
|
|
};
|
|
|
|
static void manifest_process_version(JsonManifestParseContext *context,
|
|
int manifest_version);
|
|
static void manifest_process_system_identifier(JsonManifestParseContext *context,
|
|
uint64 manifest_system_identifier);
|
|
static void manifest_process_file(JsonManifestParseContext *context,
|
|
char *pathname,
|
|
size_t size,
|
|
pg_checksum_type checksum_type,
|
|
int checksum_length,
|
|
uint8 *checksum_payload);
|
|
static void manifest_process_wal_range(JsonManifestParseContext *context,
|
|
TimeLineID tli,
|
|
XLogRecPtr start_lsn,
|
|
XLogRecPtr end_lsn);
|
|
static void manifest_report_error(JsonManifestParseContext *ib,
|
|
const char *fmt,...)
|
|
pg_attribute_printf(2, 3) pg_attribute_noreturn();
|
|
static int compare_block_numbers(const void *a, const void *b);
|
|
|
|
/*
|
|
* Create a new object for storing information extracted from the manifest
|
|
* supplied when creating an incremental backup.
|
|
*/
|
|
IncrementalBackupInfo *
|
|
CreateIncrementalBackupInfo(MemoryContext mcxt)
|
|
{
|
|
IncrementalBackupInfo *ib;
|
|
MemoryContext oldcontext;
|
|
JsonManifestParseContext *context;
|
|
|
|
oldcontext = MemoryContextSwitchTo(mcxt);
|
|
|
|
ib = palloc0(sizeof(IncrementalBackupInfo));
|
|
ib->mcxt = mcxt;
|
|
initStringInfo(&ib->buf);
|
|
|
|
/*
|
|
* It's hard to guess how many files a "typical" installation will have in
|
|
* the data directory, but a fresh initdb creates almost 1000 files as of
|
|
* this writing, so it seems to make sense for our estimate to
|
|
* substantially higher.
|
|
*/
|
|
ib->manifest_files = backup_file_create(mcxt, 10000, NULL);
|
|
|
|
context = palloc0(sizeof(JsonManifestParseContext));
|
|
/* Parse the manifest. */
|
|
context->private_data = ib;
|
|
context->version_cb = manifest_process_version;
|
|
context->system_identifier_cb = manifest_process_system_identifier;
|
|
context->per_file_cb = manifest_process_file;
|
|
context->per_wal_range_cb = manifest_process_wal_range;
|
|
context->error_cb = manifest_report_error;
|
|
|
|
ib->inc_state = json_parse_manifest_incremental_init(context);
|
|
|
|
MemoryContextSwitchTo(oldcontext);
|
|
|
|
return ib;
|
|
}
|
|
|
|
/*
|
|
* Before taking an incremental backup, the caller must supply the backup
|
|
* manifest from a prior backup. Each chunk of manifest data received
|
|
* from the client should be passed to this function.
|
|
*/
|
|
void
|
|
AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data,
|
|
int len)
|
|
{
|
|
MemoryContext oldcontext;
|
|
|
|
/* Switch to our memory context. */
|
|
oldcontext = MemoryContextSwitchTo(ib->mcxt);
|
|
|
|
if (ib->buf.len > MIN_CHUNK && ib->buf.len + len > MAX_CHUNK)
|
|
{
|
|
/*
|
|
* time for an incremental parse. We'll do all but the last MIN_CHUNK
|
|
* so that we have enough left for the final piece.
|
|
*/
|
|
json_parse_manifest_incremental_chunk(
|
|
ib->inc_state, ib->buf.data, ib->buf.len - MIN_CHUNK, false);
|
|
/* now remove what we just parsed */
|
|
memmove(ib->buf.data, ib->buf.data + (ib->buf.len - MIN_CHUNK),
|
|
MIN_CHUNK + 1);
|
|
ib->buf.len = MIN_CHUNK;
|
|
}
|
|
|
|
appendBinaryStringInfo(&ib->buf, data, len);
|
|
|
|
/* Switch back to previous memory context. */
|
|
MemoryContextSwitchTo(oldcontext);
|
|
}
|
|
|
|
/*
|
|
* Finalize an IncrementalBackupInfo object after all manifest data has
|
|
* been supplied via calls to AppendIncrementalManifestData.
|
|
*/
|
|
void
|
|
FinalizeIncrementalManifest(IncrementalBackupInfo *ib)
|
|
{
|
|
MemoryContext oldcontext;
|
|
|
|
/* Switch to our memory context. */
|
|
oldcontext = MemoryContextSwitchTo(ib->mcxt);
|
|
|
|
/* Parse the last chunk of the manifest */
|
|
json_parse_manifest_incremental_chunk(
|
|
ib->inc_state, ib->buf.data, ib->buf.len, true);
|
|
|
|
/* Done with the buffer, so release memory. */
|
|
pfree(ib->buf.data);
|
|
ib->buf.data = NULL;
|
|
|
|
/* Done with inc_state, so release that memory too */
|
|
json_parse_manifest_incremental_shutdown(ib->inc_state);
|
|
|
|
/* Switch back to previous memory context. */
|
|
MemoryContextSwitchTo(oldcontext);
|
|
}
|
|
|
|
/*
|
|
* Prepare to take an incremental backup.
|
|
*
|
|
* Before this function is called, AppendIncrementalManifestData and
|
|
* FinalizeIncrementalManifest should have already been called to pass all
|
|
* the manifest data to this object.
|
|
*
|
|
* This function performs sanity checks on the data extracted from the
|
|
* manifest and figures out for which WAL ranges we need summaries, and
|
|
* whether those summaries are available. Then, it reads and combines the
|
|
* data from those summary files. It also updates the backup_state with the
|
|
* reference TLI and LSN for the prior backup.
|
|
*/
|
|
void
|
|
PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
|
|
BackupState *backup_state)
|
|
{
|
|
MemoryContext oldcontext;
|
|
List *expectedTLEs;
|
|
List *all_wslist,
|
|
*required_wslist = NIL;
|
|
ListCell *lc;
|
|
TimeLineHistoryEntry **tlep;
|
|
int num_wal_ranges;
|
|
int i;
|
|
bool found_backup_start_tli = false;
|
|
TimeLineID earliest_wal_range_tli = 0;
|
|
XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr;
|
|
TimeLineID latest_wal_range_tli = 0;
|
|
XLogRecPtr summarized_lsn;
|
|
XLogRecPtr pending_lsn;
|
|
XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
|
|
int deadcycles = 0;
|
|
TimestampTz initial_time,
|
|
current_time;
|
|
|
|
Assert(ib->buf.data == NULL);
|
|
|
|
/* Switch to our memory context. */
|
|
oldcontext = MemoryContextSwitchTo(ib->mcxt);
|
|
|
|
/*
|
|
* A valid backup manifest must always contain at least one WAL range
|
|
* (usually exactly one, unless the backup spanned a timeline switch).
|
|
*/
|
|
num_wal_ranges = list_length(ib->manifest_wal_ranges);
|
|
if (num_wal_ranges == 0)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("manifest contains no required WAL ranges")));
|
|
|
|
/*
|
|
* Match up the TLIs that appear in the WAL ranges of the backup manifest
|
|
* with those that appear in this server's timeline history. We expect
|
|
* every backup_wal_range to match to a TimeLineHistoryEntry; if it does
|
|
* not, that's an error.
|
|
*
|
|
* This loop also decides which of the WAL ranges is the manifest is most
|
|
* ancient and which one is the newest, according to the timeline history
|
|
* of this server, and stores TLIs of those WAL ranges into
|
|
* earliest_wal_range_tli and latest_wal_range_tli. It also updates
|
|
* earliest_wal_range_start_lsn to the start LSN of the WAL range for
|
|
* earliest_wal_range_tli.
|
|
*
|
|
* Note that the return value of readTimeLineHistory puts the latest
|
|
* timeline at the beginning of the list, not the end. Hence, the earliest
|
|
* TLI is the one that occurs nearest the end of the list returned by
|
|
* readTimeLineHistory, and the latest TLI is the one that occurs closest
|
|
* to the beginning.
|
|
*/
|
|
expectedTLEs = readTimeLineHistory(backup_state->starttli);
|
|
tlep = palloc0(num_wal_ranges * sizeof(TimeLineHistoryEntry *));
|
|
for (i = 0; i < num_wal_ranges; ++i)
|
|
{
|
|
backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i);
|
|
bool saw_earliest_wal_range_tli = false;
|
|
bool saw_latest_wal_range_tli = false;
|
|
|
|
/* Search this server's history for this WAL range's TLI. */
|
|
foreach(lc, expectedTLEs)
|
|
{
|
|
TimeLineHistoryEntry *tle = lfirst(lc);
|
|
|
|
if (tle->tli == range->tli)
|
|
{
|
|
tlep[i] = tle;
|
|
break;
|
|
}
|
|
|
|
if (tle->tli == earliest_wal_range_tli)
|
|
saw_earliest_wal_range_tli = true;
|
|
if (tle->tli == latest_wal_range_tli)
|
|
saw_latest_wal_range_tli = true;
|
|
}
|
|
|
|
/*
|
|
* An incremental backup can only be taken relative to a backup that
|
|
* represents a previous state of this server. If the backup requires
|
|
* WAL from a timeline that's not in our history, that definitely
|
|
* isn't the case.
|
|
*/
|
|
if (tlep[i] == NULL)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("timeline %u found in manifest, but not in this server's history",
|
|
range->tli)));
|
|
|
|
/*
|
|
* If we found this TLI in the server's history before encountering
|
|
* the latest TLI seen so far in the server's history, then this TLI
|
|
* is the latest one seen so far.
|
|
*
|
|
* If on the other hand we saw the earliest TLI seen so far before
|
|
* finding this TLI, this TLI is earlier than the earliest one seen so
|
|
* far. And if this is the first TLI for which we've searched, it's
|
|
* also the earliest one seen so far.
|
|
*
|
|
* On the first loop iteration, both things should necessarily be
|
|
* true.
|
|
*/
|
|
if (!saw_latest_wal_range_tli)
|
|
latest_wal_range_tli = range->tli;
|
|
if (earliest_wal_range_tli == 0 || saw_earliest_wal_range_tli)
|
|
{
|
|
earliest_wal_range_tli = range->tli;
|
|
earliest_wal_range_start_lsn = range->start_lsn;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Propagate information about the prior backup into the backup_label that
|
|
* will be generated for this backup.
|
|
*/
|
|
backup_state->istartpoint = earliest_wal_range_start_lsn;
|
|
backup_state->istarttli = earliest_wal_range_tli;
|
|
|
|
/*
|
|
* Sanity check start and end LSNs for the WAL ranges in the manifest.
|
|
*
|
|
* Commonly, there won't be any timeline switches during the prior backup
|
|
* at all, but if there are, they should happen at the same LSNs that this
|
|
* server switched timelines.
|
|
*
|
|
* Whether there are any timeline switches during the prior backup or not,
|
|
* the prior backup shouldn't require any WAL from a timeline prior to the
|
|
* start of that timeline. It also shouldn't require any WAL from later
|
|
* than the start of this backup.
|
|
*
|
|
* If any of these sanity checks fail, one possible explanation is that
|
|
* the user has generated WAL on the same timeline with the same LSNs more
|
|
* than once. For instance, if two standbys running on timeline 1 were
|
|
* both promoted and (due to a broken archiving setup) both selected new
|
|
* timeline ID 2, then it's possible that one of these checks might trip.
|
|
*
|
|
* Note that there are lots of ways for the user to do something very bad
|
|
* without tripping any of these checks, and they are not intended to be
|
|
* comprehensive. It's pretty hard to see how we could be certain of
|
|
* anything here. However, if there's a problem staring us right in the
|
|
* face, it's best to report it, so we do.
|
|
*/
|
|
for (i = 0; i < num_wal_ranges; ++i)
|
|
{
|
|
backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i);
|
|
|
|
if (range->tli == earliest_wal_range_tli)
|
|
{
|
|
if (range->start_lsn < tlep[i]->begin)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("manifest requires WAL from initial timeline %u starting at %X/%X, but that timeline begins at %X/%X",
|
|
range->tli,
|
|
LSN_FORMAT_ARGS(range->start_lsn),
|
|
LSN_FORMAT_ARGS(tlep[i]->begin))));
|
|
}
|
|
else
|
|
{
|
|
if (range->start_lsn != tlep[i]->begin)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("manifest requires WAL from continuation timeline %u starting at %X/%X, but that timeline begins at %X/%X",
|
|
range->tli,
|
|
LSN_FORMAT_ARGS(range->start_lsn),
|
|
LSN_FORMAT_ARGS(tlep[i]->begin))));
|
|
}
|
|
|
|
if (range->tli == latest_wal_range_tli)
|
|
{
|
|
if (range->end_lsn > backup_state->startpoint)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("manifest requires WAL from final timeline %u ending at %X/%X, but this backup starts at %X/%X",
|
|
range->tli,
|
|
LSN_FORMAT_ARGS(range->end_lsn),
|
|
LSN_FORMAT_ARGS(backup_state->startpoint))));
|
|
}
|
|
else
|
|
{
|
|
if (range->end_lsn != tlep[i]->end)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("manifest requires WAL from non-final timeline %u ending at %X/%X, but this server switched timelines at %X/%X",
|
|
range->tli,
|
|
LSN_FORMAT_ARGS(range->end_lsn),
|
|
LSN_FORMAT_ARGS(tlep[i]->end))));
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
* Wait for WAL summarization to catch up to the backup start LSN (but
|
|
* time out if it doesn't do so quickly enough).
|
|
*/
|
|
initial_time = current_time = GetCurrentTimestamp();
|
|
while (1)
|
|
{
|
|
long timeout_in_ms = 10000;
|
|
long elapsed_seconds;
|
|
|
|
/*
|
|
* Align the wait time to prevent drift. This doesn't really matter,
|
|
* but we'd like the warnings about how long we've been waiting to say
|
|
* 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
|
|
* drifting to something that is not a multiple of ten.
|
|
*/
|
|
timeout_in_ms -=
|
|
TimestampDifferenceMilliseconds(initial_time, current_time) %
|
|
timeout_in_ms;
|
|
|
|
/* Wait for up to 10 seconds. */
|
|
summarized_lsn = WaitForWalSummarization(backup_state->startpoint,
|
|
timeout_in_ms, &pending_lsn);
|
|
|
|
/* If WAL summarization has progressed sufficiently, stop waiting. */
|
|
if (summarized_lsn >= backup_state->startpoint)
|
|
break;
|
|
|
|
/*
|
|
* Keep track of the number of cycles during which there has been no
|
|
* progression of pending_lsn. If pending_lsn is not advancing, that
|
|
* means that not only are no new files appearing on disk, but we're
|
|
* not even incorporating new records into the in-memory state.
|
|
*/
|
|
if (pending_lsn > prior_pending_lsn)
|
|
{
|
|
prior_pending_lsn = pending_lsn;
|
|
deadcycles = 0;
|
|
}
|
|
else
|
|
++deadcycles;
|
|
|
|
/*
|
|
* If we've managed to wait for an entire minute without the WAL
|
|
* summarizer absorbing a single WAL record, error out; probably
|
|
* something is wrong.
|
|
*
|
|
* We could consider also erroring out if the summarizer is taking too
|
|
* long to catch up, but it's not clear what rate of progress would be
|
|
* acceptable and what would be too slow. So instead, we just try to
|
|
* error out in the case where there's no progress at all. That seems
|
|
* likely to catch a reasonable number of the things that can go wrong
|
|
* in practice (e.g. the summarizer process is completely hung, say
|
|
* because somebody hooked up a debugger to it or something) without
|
|
* giving up too quickly when the system is just slow.
|
|
*/
|
|
if (deadcycles >= 6)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("WAL summarization is not progressing"),
|
|
errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
|
|
LSN_FORMAT_ARGS(backup_state->startpoint),
|
|
LSN_FORMAT_ARGS(summarized_lsn),
|
|
LSN_FORMAT_ARGS(pending_lsn))));
|
|
|
|
/*
|
|
* Otherwise, just let the user know what's happening.
|
|
*/
|
|
current_time = GetCurrentTimestamp();
|
|
elapsed_seconds =
|
|
TimestampDifferenceMilliseconds(initial_time, current_time) / 1000;
|
|
ereport(WARNING,
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
|
|
LSN_FORMAT_ARGS(backup_state->startpoint),
|
|
elapsed_seconds),
|
|
errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
|
|
LSN_FORMAT_ARGS(summarized_lsn),
|
|
LSN_FORMAT_ARGS(pending_lsn))));
|
|
}
|
|
|
|
/*
|
|
* Retrieve a list of all WAL summaries on any timeline that overlap with
|
|
* the LSN range of interest. We could instead call GetWalSummaries() once
|
|
* per timeline in the loop that follows, but that would involve reading
|
|
* the directory multiple times. It should be mildly faster - and perhaps
|
|
* a bit safer - to do it just once.
|
|
*/
|
|
all_wslist = GetWalSummaries(0, earliest_wal_range_start_lsn,
|
|
backup_state->startpoint);
|
|
|
|
/*
|
|
* We need WAL summaries for everything that happened during the prior
|
|
* backup and everything that happened afterward up until the point where
|
|
* the current backup started.
|
|
*/
|
|
foreach(lc, expectedTLEs)
|
|
{
|
|
TimeLineHistoryEntry *tle = lfirst(lc);
|
|
XLogRecPtr tli_start_lsn = tle->begin;
|
|
XLogRecPtr tli_end_lsn = tle->end;
|
|
XLogRecPtr tli_missing_lsn = InvalidXLogRecPtr;
|
|
List *tli_wslist;
|
|
|
|
/*
|
|
* Working through the history of this server from the current
|
|
* timeline backwards, we skip everything until we find the timeline
|
|
* where this backup started. Most of the time, this means we won't
|
|
* skip anything at all, as it's unlikely that the timeline has
|
|
* changed since the beginning of the backup moments ago.
|
|
*/
|
|
if (tle->tli == backup_state->starttli)
|
|
{
|
|
found_backup_start_tli = true;
|
|
tli_end_lsn = backup_state->startpoint;
|
|
}
|
|
else if (!found_backup_start_tli)
|
|
continue;
|
|
|
|
/*
|
|
* Find the summaries that overlap the LSN range of interest for this
|
|
* timeline. If this is the earliest timeline involved, the range of
|
|
* interest begins with the start LSN of the prior backup; otherwise,
|
|
* it begins at the LSN at which this timeline came into existence. If
|
|
* this is the latest TLI involved, the range of interest ends at the
|
|
* start LSN of the current backup; otherwise, it ends at the point
|
|
* where we switched from this timeline to the next one.
|
|
*/
|
|
if (tle->tli == earliest_wal_range_tli)
|
|
tli_start_lsn = earliest_wal_range_start_lsn;
|
|
tli_wslist = FilterWalSummaries(all_wslist, tle->tli,
|
|
tli_start_lsn, tli_end_lsn);
|
|
|
|
/*
|
|
* There is no guarantee that the WAL summaries we found cover the
|
|
* entire range of LSNs for which summaries are required, or indeed
|
|
* that we found any WAL summaries at all. Check whether we have a
|
|
* problem of that sort.
|
|
*/
|
|
if (!WalSummariesAreComplete(tli_wslist, tli_start_lsn, tli_end_lsn,
|
|
&tli_missing_lsn))
|
|
{
|
|
if (XLogRecPtrIsInvalid(tli_missing_lsn))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but no summaries for that timeline and LSN range exist",
|
|
tle->tli,
|
|
LSN_FORMAT_ARGS(tli_start_lsn),
|
|
LSN_FORMAT_ARGS(tli_end_lsn))));
|
|
else
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but the summaries for that timeline and LSN range are incomplete",
|
|
tle->tli,
|
|
LSN_FORMAT_ARGS(tli_start_lsn),
|
|
LSN_FORMAT_ARGS(tli_end_lsn)),
|
|
errdetail("The first unsummarized LSN in this range is %X/%X.",
|
|
LSN_FORMAT_ARGS(tli_missing_lsn))));
|
|
}
|
|
|
|
/*
|
|
* Remember that we need to read these summaries.
|
|
*
|
|
* Technically, it's possible that this could read more files than
|
|
* required, since tli_wslist in theory could contain redundant
|
|
* summaries. For instance, if we have a summary from 0/10000000 to
|
|
* 0/20000000 and also one from 0/00000000 to 0/30000000, then the
|
|
* latter subsumes the former and the former could be ignored.
|
|
*
|
|
* We ignore this possibility because the WAL summarizer only tries to
|
|
* generate summaries that do not overlap. If somehow they exist,
|
|
* we'll do a bit of extra work but the results should still be
|
|
* correct.
|
|
*/
|
|
required_wslist = list_concat(required_wslist, tli_wslist);
|
|
|
|
/*
|
|
* Timelines earlier than the one in which the prior backup began are
|
|
* not relevant.
|
|
*/
|
|
if (tle->tli == earliest_wal_range_tli)
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* Read all of the required block reference table files and merge all of
|
|
* the data into a single in-memory block reference table.
|
|
*
|
|
* See the comments for struct IncrementalBackupInfo for some thoughts on
|
|
* memory usage.
|
|
*/
|
|
ib->brtab = CreateEmptyBlockRefTable();
|
|
foreach(lc, required_wslist)
|
|
{
|
|
WalSummaryFile *ws = lfirst(lc);
|
|
WalSummaryIO wsio;
|
|
BlockRefTableReader *reader;
|
|
RelFileLocator rlocator;
|
|
ForkNumber forknum;
|
|
BlockNumber limit_block;
|
|
BlockNumber blocks[BLOCKS_PER_READ];
|
|
|
|
wsio.file = OpenWalSummaryFile(ws, false);
|
|
wsio.filepos = 0;
|
|
ereport(DEBUG1,
|
|
(errmsg_internal("reading WAL summary file \"%s\"",
|
|
FilePathName(wsio.file))));
|
|
reader = CreateBlockRefTableReader(ReadWalSummary, &wsio,
|
|
FilePathName(wsio.file),
|
|
ReportWalSummaryError, NULL);
|
|
while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum,
|
|
&limit_block))
|
|
{
|
|
BlockRefTableSetLimitBlock(ib->brtab, &rlocator,
|
|
forknum, limit_block);
|
|
|
|
while (1)
|
|
{
|
|
unsigned nblocks;
|
|
unsigned i;
|
|
|
|
nblocks = BlockRefTableReaderGetBlocks(reader, blocks,
|
|
BLOCKS_PER_READ);
|
|
if (nblocks == 0)
|
|
break;
|
|
|
|
for (i = 0; i < nblocks; ++i)
|
|
BlockRefTableMarkBlockModified(ib->brtab, &rlocator,
|
|
forknum, blocks[i]);
|
|
}
|
|
}
|
|
DestroyBlockRefTableReader(reader);
|
|
FileClose(wsio.file);
|
|
}
|
|
|
|
/* Switch back to previous memory context. */
|
|
MemoryContextSwitchTo(oldcontext);
|
|
}
|
|
|
|
/*
|
|
* Get the pathname that should be used when a file is sent incrementally.
|
|
*
|
|
* The result is a palloc'd string.
|
|
*/
|
|
char *
|
|
GetIncrementalFilePath(Oid dboid, Oid spcoid, RelFileNumber relfilenumber,
|
|
ForkNumber forknum, unsigned segno)
|
|
{
|
|
char *path;
|
|
char *lastslash;
|
|
char *ipath;
|
|
|
|
path = GetRelationPath(dboid, spcoid, relfilenumber, INVALID_PROC_NUMBER,
|
|
forknum);
|
|
|
|
lastslash = strrchr(path, '/');
|
|
Assert(lastslash != NULL);
|
|
*lastslash = '\0';
|
|
|
|
if (segno > 0)
|
|
ipath = psprintf("%s/INCREMENTAL.%s.%u", path, lastslash + 1, segno);
|
|
else
|
|
ipath = psprintf("%s/INCREMENTAL.%s", path, lastslash + 1);
|
|
|
|
pfree(path);
|
|
|
|
return ipath;
|
|
}
|
|
|
|
/*
|
|
* How should we back up a particular file as part of an incremental backup?
|
|
*
|
|
* If the return value is BACK_UP_FILE_FULLY, caller should back up the whole
|
|
* file just as if this were not an incremental backup. The contents of the
|
|
* relative_block_numbers array are unspecified in this case.
|
|
*
|
|
* If the return value is BACK_UP_FILE_INCREMENTALLY, caller should include
|
|
* an incremental file in the backup instead of the entire file. On return,
|
|
* *num_blocks_required will be set to the number of blocks that need to be
|
|
* sent, and the actual block numbers will have been stored in
|
|
* relative_block_numbers, which should be an array of at least RELSEG_SIZE.
|
|
* In addition, *truncation_block_length will be set to the value that should
|
|
* be included in the incremental file.
|
|
*/
|
|
FileBackupMethod
|
|
GetFileBackupMethod(IncrementalBackupInfo *ib, const char *path,
|
|
Oid dboid, Oid spcoid,
|
|
RelFileNumber relfilenumber, ForkNumber forknum,
|
|
unsigned segno, size_t size,
|
|
unsigned *num_blocks_required,
|
|
BlockNumber *relative_block_numbers,
|
|
unsigned *truncation_block_length)
|
|
{
|
|
BlockNumber limit_block;
|
|
BlockNumber start_blkno;
|
|
BlockNumber stop_blkno;
|
|
RelFileLocator rlocator;
|
|
BlockRefTableEntry *brtentry;
|
|
unsigned i;
|
|
unsigned nblocks;
|
|
|
|
/* Should only be called after PrepareForIncrementalBackup. */
|
|
Assert(ib->buf.data == NULL);
|
|
|
|
/*
|
|
* dboid could be InvalidOid if shared rel, but spcoid and relfilenumber
|
|
* should have legal values.
|
|
*/
|
|
Assert(OidIsValid(spcoid));
|
|
Assert(RelFileNumberIsValid(relfilenumber));
|
|
|
|
/*
|
|
* If the file size is too large or not a multiple of BLCKSZ, then
|
|
* something weird is happening, so give up and send the whole file.
|
|
*/
|
|
if ((size % BLCKSZ) != 0 || size / BLCKSZ > RELSEG_SIZE)
|
|
return BACK_UP_FILE_FULLY;
|
|
|
|
/*
|
|
* The free-space map fork is not properly WAL-logged, so we need to
|
|
* backup the entire file every time.
|
|
*/
|
|
if (forknum == FSM_FORKNUM)
|
|
return BACK_UP_FILE_FULLY;
|
|
|
|
/*
|
|
* If this file was not part of the prior backup, back it up fully.
|
|
*
|
|
* If this file was created after the prior backup and before the start of
|
|
* the current backup, then the WAL summary information will tell us to
|
|
* back up the whole file. However, if this file was created after the
|
|
* start of the current backup, then the WAL summary won't know anything
|
|
* about it. Without this logic, we would erroneously conclude that it was
|
|
* OK to send it incrementally.
|
|
*
|
|
* Note that the file could have existed at the time of the prior backup,
|
|
* gotten deleted, and then a new file with the same name could have been
|
|
* created. In that case, this logic won't prevent the file from being
|
|
* backed up incrementally. But, if the deletion happened before the start
|
|
* of the current backup, the limit block will be 0, inducing a full
|
|
* backup. If the deletion happened after the start of the current backup,
|
|
* reconstruction will erroneously combine blocks from the current
|
|
* lifespan of the file with blocks from the previous lifespan -- but in
|
|
* this type of case, WAL replay to reach backup consistency should remove
|
|
* and recreate the file anyway, so the initial bogus contents should not
|
|
* matter.
|
|
*/
|
|
if (backup_file_lookup(ib->manifest_files, path) == NULL)
|
|
{
|
|
char *ipath;
|
|
|
|
ipath = GetIncrementalFilePath(dboid, spcoid, relfilenumber,
|
|
forknum, segno);
|
|
if (backup_file_lookup(ib->manifest_files, ipath) == NULL)
|
|
return BACK_UP_FILE_FULLY;
|
|
}
|
|
|
|
/*
|
|
* Look up the special block reference table entry for the database as a
|
|
* whole.
|
|
*/
|
|
rlocator.spcOid = spcoid;
|
|
rlocator.dbOid = dboid;
|
|
rlocator.relNumber = 0;
|
|
if (BlockRefTableGetEntry(ib->brtab, &rlocator, MAIN_FORKNUM,
|
|
&limit_block) != NULL)
|
|
{
|
|
/*
|
|
* According to the WAL summary, this database OID/tablespace OID
|
|
* pairing has been created since the previous backup. So, everything
|
|
* in it must be backed up fully.
|
|
*/
|
|
return BACK_UP_FILE_FULLY;
|
|
}
|
|
|
|
/* Look up the block reference table entry for this relfilenode. */
|
|
rlocator.relNumber = relfilenumber;
|
|
brtentry = BlockRefTableGetEntry(ib->brtab, &rlocator, forknum,
|
|
&limit_block);
|
|
|
|
/*
|
|
* If there is no entry, then there have been no WAL-logged changes to the
|
|
* relation since the predecessor backup was taken, so we can back it up
|
|
* incrementally and need not include any modified blocks.
|
|
*
|
|
* However, if the file is zero-length, we should do a full backup,
|
|
* because an incremental file is always more than zero length, and it's
|
|
* silly to take an incremental backup when a full backup would be
|
|
* smaller.
|
|
*/
|
|
if (brtentry == NULL)
|
|
{
|
|
if (size == 0)
|
|
return BACK_UP_FILE_FULLY;
|
|
*num_blocks_required = 0;
|
|
*truncation_block_length = size / BLCKSZ;
|
|
return BACK_UP_FILE_INCREMENTALLY;
|
|
}
|
|
|
|
/*
|
|
* If the limit_block is less than or equal to the point where this
|
|
* segment starts, send the whole file.
|
|
*/
|
|
if (limit_block <= segno * RELSEG_SIZE)
|
|
return BACK_UP_FILE_FULLY;
|
|
|
|
/*
|
|
* Get relevant entries from the block reference table entry.
|
|
*
|
|
* We shouldn't overflow computing the start or stop block numbers, but if
|
|
* it manages to happen somehow, detect it and throw an error.
|
|
*/
|
|
start_blkno = segno * RELSEG_SIZE;
|
|
stop_blkno = start_blkno + (size / BLCKSZ);
|
|
if (start_blkno / RELSEG_SIZE != segno || stop_blkno < start_blkno)
|
|
ereport(ERROR,
|
|
errcode(ERRCODE_INTERNAL_ERROR),
|
|
errmsg_internal("overflow computing block number bounds for segment %u with size %zu",
|
|
segno, size));
|
|
|
|
/*
|
|
* This will write *absolute* block numbers into the output array, but
|
|
* we'll transpose them below.
|
|
*/
|
|
nblocks = BlockRefTableEntryGetBlocks(brtentry, start_blkno, stop_blkno,
|
|
relative_block_numbers, RELSEG_SIZE);
|
|
Assert(nblocks <= RELSEG_SIZE);
|
|
|
|
/*
|
|
* If we're going to have to send nearly all of the blocks, then just send
|
|
* the whole file, because that won't require much extra storage or
|
|
* transfer and will speed up and simplify backup restoration. It's not
|
|
* clear what threshold is most appropriate here and perhaps it ought to
|
|
* be configurable, but for now we're just going to say that if we'd need
|
|
* to send 90% of the blocks anyway, give up and send the whole file.
|
|
*
|
|
* NB: If you change the threshold here, at least make sure to back up the
|
|
* file fully when every single block must be sent, because there's
|
|
* nothing good about sending an incremental file in that case.
|
|
*/
|
|
if (nblocks * BLCKSZ > size * 0.9)
|
|
return BACK_UP_FILE_FULLY;
|
|
|
|
/*
|
|
* Looks like we can send an incremental file, so sort the block numbers
|
|
* and then transpose them from absolute block numbers to relative block
|
|
* numbers if necessary.
|
|
*
|
|
* NB: If the block reference table was using the bitmap representation
|
|
* for a given chunk, the block numbers in that chunk will already be
|
|
* sorted, but when the array-of-offsets representation is used, we can
|
|
* receive block numbers here out of order.
|
|
*/
|
|
qsort(relative_block_numbers, nblocks, sizeof(BlockNumber),
|
|
compare_block_numbers);
|
|
if (start_blkno != 0)
|
|
{
|
|
for (i = 0; i < nblocks; ++i)
|
|
relative_block_numbers[i] -= start_blkno;
|
|
}
|
|
*num_blocks_required = nblocks;
|
|
|
|
/*
|
|
* The truncation block length is the minimum length of the reconstructed
|
|
* file. Any block numbers below this threshold that are not present in
|
|
* the backup need to be fetched from the prior backup. At or above this
|
|
* threshold, blocks should only be included in the result if they are
|
|
* present in the backup. (This may require inserting zero blocks if the
|
|
* blocks included in the backup are non-consecutive.)
|
|
*/
|
|
*truncation_block_length = size / BLCKSZ;
|
|
if (BlockNumberIsValid(limit_block))
|
|
{
|
|
unsigned relative_limit = limit_block - segno * RELSEG_SIZE;
|
|
|
|
if (*truncation_block_length < relative_limit)
|
|
*truncation_block_length = relative_limit;
|
|
}
|
|
|
|
/* Send it incrementally. */
|
|
return BACK_UP_FILE_INCREMENTALLY;
|
|
}
|
|
|
|
/*
|
|
* Compute the size for a header of an incremental file containing a given
|
|
* number of blocks. The header is rounded to a multiple of BLCKSZ, but
|
|
* only if the file will store some block data.
|
|
*/
|
|
extern size_t
|
|
GetIncrementalHeaderSize(unsigned num_blocks_required)
|
|
{
|
|
size_t result;
|
|
|
|
/* Make sure we're not going to overflow. */
|
|
Assert(num_blocks_required <= RELSEG_SIZE);
|
|
|
|
/*
|
|
* Three four byte quantities (magic number, truncation block length,
|
|
* block count) followed by block numbers.
|
|
*/
|
|
result = 3 * sizeof(uint32) + (sizeof(BlockNumber) * num_blocks_required);
|
|
|
|
/*
|
|
* Round the header size to a multiple of BLCKSZ - when not a multiple of
|
|
* BLCKSZ, add the missing fraction of a block. But do this only if the
|
|
* file will store data for some blocks, otherwise keep it small.
|
|
*/
|
|
if ((num_blocks_required > 0) && (result % BLCKSZ != 0))
|
|
result += BLCKSZ - (result % BLCKSZ);
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* Compute the size for an incremental file containing a given number of blocks.
|
|
*/
|
|
extern size_t
|
|
GetIncrementalFileSize(unsigned num_blocks_required)
|
|
{
|
|
size_t result;
|
|
|
|
/* Make sure we're not going to overflow. */
|
|
Assert(num_blocks_required <= RELSEG_SIZE);
|
|
|
|
/*
|
|
* Header with three four byte quantities (magic number, truncation block
|
|
* length, block count) followed by block numbers, rounded to a multiple
|
|
* of BLCKSZ (for files with block data), followed by block contents.
|
|
*/
|
|
result = GetIncrementalHeaderSize(num_blocks_required);
|
|
result += BLCKSZ * num_blocks_required;
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* Helper function for filemap hash table.
|
|
*/
|
|
static uint32
|
|
hash_string_pointer(const char *s)
|
|
{
|
|
unsigned char *ss = (unsigned char *) s;
|
|
|
|
return hash_bytes(ss, strlen(s));
|
|
}
|
|
|
|
/*
|
|
* This callback to validate the manifest version for incremental backup.
|
|
*/
|
|
static void
|
|
manifest_process_version(JsonManifestParseContext *context,
|
|
int manifest_version)
|
|
{
|
|
/* Incremental backups don't work with manifest version 1 */
|
|
if (manifest_version == 1)
|
|
context->error_cb(context,
|
|
"backup manifest version 1 does not support incremental backup");
|
|
}
|
|
|
|
/*
|
|
* This callback to validate the manifest system identifier against the current
|
|
* database server.
|
|
*/
|
|
static void
|
|
manifest_process_system_identifier(JsonManifestParseContext *context,
|
|
uint64 manifest_system_identifier)
|
|
{
|
|
uint64 system_identifier;
|
|
|
|
/* Get system identifier of current system */
|
|
system_identifier = GetSystemIdentifier();
|
|
|
|
if (manifest_system_identifier != system_identifier)
|
|
context->error_cb(context,
|
|
"manifest system identifier is %llu, but database system identifier is %llu",
|
|
(unsigned long long) manifest_system_identifier,
|
|
(unsigned long long) system_identifier);
|
|
}
|
|
|
|
/*
|
|
* This callback is invoked for each file mentioned in the backup manifest.
|
|
*
|
|
* We store the path to each file and the size of each file for sanity-checking
|
|
* purposes. For further details, see comments for IncrementalBackupInfo.
|
|
*/
|
|
static void
|
|
manifest_process_file(JsonManifestParseContext *context,
|
|
char *pathname, size_t size,
|
|
pg_checksum_type checksum_type,
|
|
int checksum_length,
|
|
uint8 *checksum_payload)
|
|
{
|
|
IncrementalBackupInfo *ib = context->private_data;
|
|
backup_file_entry *entry;
|
|
bool found;
|
|
|
|
entry = backup_file_insert(ib->manifest_files, pathname, &found);
|
|
if (!found)
|
|
{
|
|
entry->path = MemoryContextStrdup(ib->manifest_files->ctx,
|
|
pathname);
|
|
entry->size = size;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* This callback is invoked for each WAL range mentioned in the backup
|
|
* manifest.
|
|
*
|
|
* We're just interested in learning the oldest LSN and the corresponding TLI
|
|
* that appear in any WAL range.
|
|
*/
|
|
static void
|
|
manifest_process_wal_range(JsonManifestParseContext *context,
|
|
TimeLineID tli, XLogRecPtr start_lsn,
|
|
XLogRecPtr end_lsn)
|
|
{
|
|
IncrementalBackupInfo *ib = context->private_data;
|
|
backup_wal_range *range = palloc(sizeof(backup_wal_range));
|
|
|
|
range->tli = tli;
|
|
range->start_lsn = start_lsn;
|
|
range->end_lsn = end_lsn;
|
|
ib->manifest_wal_ranges = lappend(ib->manifest_wal_ranges, range);
|
|
}
|
|
|
|
/*
|
|
* This callback is invoked if an error occurs while parsing the backup
|
|
* manifest.
|
|
*/
|
|
static void
|
|
manifest_report_error(JsonManifestParseContext *context, const char *fmt,...)
|
|
{
|
|
StringInfoData errbuf;
|
|
|
|
initStringInfo(&errbuf);
|
|
|
|
for (;;)
|
|
{
|
|
va_list ap;
|
|
int needed;
|
|
|
|
va_start(ap, fmt);
|
|
needed = appendStringInfoVA(&errbuf, fmt, ap);
|
|
va_end(ap);
|
|
if (needed == 0)
|
|
break;
|
|
enlargeStringInfo(&errbuf, needed);
|
|
}
|
|
|
|
ereport(ERROR,
|
|
errmsg_internal("%s", errbuf.data));
|
|
}
|
|
|
|
/*
|
|
* Quicksort comparator for block numbers.
|
|
*/
|
|
static int
|
|
compare_block_numbers(const void *a, const void *b)
|
|
{
|
|
BlockNumber aa = *(BlockNumber *) a;
|
|
BlockNumber bb = *(BlockNumber *) b;
|
|
|
|
return pg_cmp_u32(aa, bb);
|
|
}
|