1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-02 04:21:28 +03:00

Validate page level checksums in base backups

When base backups are run over the replication protocol (for example
using pg_basebackup), verify the checksums of all data blocks if
checksums are enabled. If checksum failures are encountered, log them
as warnings but don't abort the backup.

This becomes the default behaviour in pg_basebackup (provided checksums
are enabled on the server), so add a switch (-k) to disable the checks
if necessary.

Author: Michael Banck
Reviewed-By: Magnus Hagander, David Steele
Discussion: https://postgr.es/m/20180228180856.GE13784@nighthawk.caipicrew.dd-dns.de
This commit is contained in:
Magnus Hagander
2018-04-03 13:47:16 +02:00
parent 4923550c20
commit 4eb77d50c2
7 changed files with 352 additions and 13 deletions

View File

@@ -39,6 +39,7 @@
#include "replication/basebackup.h"
#include "streamutil.h"
#define ERRCODE_DATA_CORRUPTED "XX001"
typedef struct TablespaceListCell
{
@@ -81,6 +82,7 @@ static char *xlog_dir = NULL;
static char format = 'p'; /* p(lain)/t(ar) */
static char *label = "pg_basebackup base backup";
static bool noclean = false;
static bool checksum_failure = false;
static bool showprogress = false;
static int verbose = 0;
static int compresslevel = 0;
@@ -95,6 +97,7 @@ static char *replication_slot = NULL;
static bool temp_replication_slot = true;
static bool create_slot = false;
static bool no_slot = false;
static bool verify_checksums = true;
static bool success = false;
static bool made_new_pgdata = false;
@@ -155,7 +158,7 @@ cleanup_directories_atexit(void)
if (success || in_log_streamer)
return;
if (!noclean)
if (!noclean && !checksum_failure)
{
if (made_new_pgdata)
{
@@ -195,7 +198,7 @@ cleanup_directories_atexit(void)
}
else
{
if (made_new_pgdata || found_existing_pgdata)
if ((made_new_pgdata || found_existing_pgdata) && !checksum_failure)
fprintf(stderr,
_("%s: data directory \"%s\" not removed at user's request\n"),
progname, basedir);
@@ -206,7 +209,7 @@ cleanup_directories_atexit(void)
progname, xlog_dir);
}
if (made_tablespace_dirs || found_tablespace_dirs)
if ((made_tablespace_dirs || found_tablespace_dirs) && !checksum_failure)
fprintf(stderr,
_("%s: changes to tablespace directories will not be undone\n"),
progname);
@@ -360,6 +363,8 @@ usage(void)
printf(_(" -P, --progress show progress information\n"));
printf(_(" -S, --slot=SLOTNAME replication slot to use\n"));
printf(_(" --no-slot prevent creation of temporary replication slot\n"));
printf(_(" -k, --no-verify-checksums\n"
" do not verify checksums\n"));
printf(_(" -v, --verbose output verbose messages\n"));
printf(_(" -V, --version output version information, then exit\n"));
printf(_(" -?, --help show this help, then exit\n"));
@@ -1808,14 +1813,15 @@ BaseBackup(void)
}
basebkp =
psprintf("BASE_BACKUP LABEL '%s' %s %s %s %s %s %s",
psprintf("BASE_BACKUP LABEL '%s' %s %s %s %s %s %s %s",
escaped_label,
showprogress ? "PROGRESS" : "",
includewal == FETCH_WAL ? "WAL" : "",
fastcheckpoint ? "FAST" : "",
includewal == NO_WAL ? "" : "NOWAIT",
maxrate_clause ? maxrate_clause : "",
format == 't' ? "TABLESPACE_MAP" : "");
format == 't' ? "TABLESPACE_MAP" : "",
verify_checksums ? "" : "NOVERIFY_CHECKSUMS");
if (PQsendQuery(conn, basebkp) == 0)
{
@@ -1970,8 +1976,20 @@ BaseBackup(void)
res = PQgetResult(conn);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
fprintf(stderr, _("%s: final receive failed: %s"),
progname, PQerrorMessage(conn));
const char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
if (sqlstate &&
strcmp(sqlstate, ERRCODE_DATA_CORRUPTED) == 0)
{
fprintf(stderr, _("%s: checksum error occured\n"),
progname);
checksum_failure = true;
}
else
{
fprintf(stderr, _("%s: final receive failed: %s"),
progname, PQerrorMessage(conn));
}
disconnect_and_exit(1);
}
@@ -2140,6 +2158,7 @@ main(int argc, char **argv)
{"progress", no_argument, NULL, 'P'},
{"waldir", required_argument, NULL, 1},
{"no-slot", no_argument, NULL, 2},
{"no-verify-checksums", no_argument, NULL, 'k'},
{NULL, 0, NULL, 0}
};
int c;
@@ -2166,7 +2185,7 @@ main(int argc, char **argv)
atexit(cleanup_directories_atexit);
while ((c = getopt_long(argc, argv, "CD:F:r:RS:T:X:l:nNzZ:d:c:h:p:U:s:wWvP",
while ((c = getopt_long(argc, argv, "CD:F:r:RS:T:X:l:nNzZ:d:c:h:p:U:s:wWkvP",
long_options, &option_index)) != -1)
{
switch (c)
@@ -2308,6 +2327,9 @@ main(int argc, char **argv)
case 'P':
showprogress = true;
break;
case 'k':
verify_checksums = false;
break;
default:
/*

View File

@@ -5,7 +5,7 @@ use Config;
use File::Basename qw(basename dirname);
use PostgresNode;
use TestLib;
use Test::More tests => 93;
use Test::More tests => 104;
program_help_ok('pg_basebackup');
program_version_ok('pg_basebackup');
@@ -16,7 +16,7 @@ my $tempdir = TestLib::tempdir;
my $node = get_new_node('main');
# Initialize node without replication settings
$node->init;
$node->init(extra => [ '--data-checksums' ]);
$node->start;
my $pgdata = $node->data_dir;
@@ -402,3 +402,61 @@ like(
slurp_file("$tempdir/backupxs_sl_R/recovery.conf"),
qr/^primary_slot_name = 'slot1'\n/m,
'recovery.conf sets primary_slot_name');
my $checksum = $node->safe_psql('postgres', 'SHOW data_checksums;');
is($checksum, 'on', 'checksums are enabled');
# get relfilenodes of relations to corrupt
my $pg_class = $node->safe_psql('postgres',
q{SELECT pg_relation_filepath('pg_class')}
);
my $pg_index = $node->safe_psql('postgres',
q{SELECT pg_relation_filepath('pg_index')}
);
# induce corruption
open $file, '+<', "$pgdata/$pg_class";
seek($file, 4000, 0);
syswrite($file, '\0\0\0\0\0\0\0\0\0');
close $file;
$node->command_checks_all([ 'pg_basebackup', '-D', "$tempdir/backup_corrupt"],
1,
[qr{^$}],
[qr/^WARNING.*checksum verification failed/s],
'pg_basebackup reports checksum mismatch'
);
# induce further corruption in 5 more blocks
open $file, '+<', "$pgdata/$pg_class";
my @offsets = (12192, 20384, 28576, 36768, 44960);
foreach my $offset (@offsets) {
seek($file, $offset, 0);
syswrite($file, '\0\0\0\0\0\0\0\0\0');
}
close $file;
$node->command_checks_all([ 'pg_basebackup', '-D', "$tempdir/backup_corrupt2"],
1,
[qr{^$}],
[qr/^WARNING.*further.*failures.*will.not.be.reported/s],
'pg_basebackup does not report more than 5 checksum mismatches'
);
# induce corruption in a second file
open $file, '+<', "$pgdata/$pg_index";
seek($file, 4000, 0);
syswrite($file, '\0\0\0\0\0\0\0\0\0');
close $file;
$node->command_checks_all([ 'pg_basebackup', '-D', "$tempdir/backup_corrupt3"],
1,
[qr{^$}],
[qr/^WARNING.*7 total checksum verification failures/s],
'pg_basebackup correctly report the total number of checksum mismatches'
);
# do not verify checksums, should return ok
$node->command_ok(
[ 'pg_basebackup', '-D', "$tempdir/backup_corrupt4", '-k' ],
'pg_basebackup with -k does not report checksum mismatch');