mirror of
https://github.com/postgres/postgres.git
synced 2025-04-27 22:56:53 +03:00
Track block level checksum failures in pg_stat_database
This adds a column that counts how many checksum failures have occurred on files belonging to a specific database. Both checksum failures detected during normal backend processing and those detected while taking a base backup are counted. Author: Magnus Hagander. Reviewed by: Julien Rouhaud.
This commit is contained in:
parent
3c5926301a
commit
6b9e875f72
@ -2508,6 +2508,11 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
|
|||||||
<entry><type>bigint</type></entry>
|
<entry><type>bigint</type></entry>
|
||||||
<entry>Number of deadlocks detected in this database</entry>
|
<entry>Number of deadlocks detected in this database</entry>
|
||||||
</row>
|
</row>
|
||||||
|
<row>
|
||||||
|
<entry><structfield>checksum_failures</structfield></entry>
|
||||||
|
<entry><type>bigint</type></entry>
|
||||||
|
<entry>Number of data page checksum failures detected in this database</entry>
|
||||||
|
</row>
|
||||||
<row>
|
<row>
|
||||||
<entry><structfield>blk_read_time</structfield></entry>
|
<entry><structfield>blk_read_time</structfield></entry>
|
||||||
<entry><type>double precision</type></entry>
|
<entry><type>double precision</type></entry>
|
||||||
|
@ -823,6 +823,7 @@ CREATE VIEW pg_stat_database AS
|
|||||||
pg_stat_get_db_temp_files(D.oid) AS temp_files,
|
pg_stat_get_db_temp_files(D.oid) AS temp_files,
|
||||||
pg_stat_get_db_temp_bytes(D.oid) AS temp_bytes,
|
pg_stat_get_db_temp_bytes(D.oid) AS temp_bytes,
|
||||||
pg_stat_get_db_deadlocks(D.oid) AS deadlocks,
|
pg_stat_get_db_deadlocks(D.oid) AS deadlocks,
|
||||||
|
pg_stat_get_db_checksum_failures(D.oid) AS checksum_failures,
|
||||||
pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time,
|
pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time,
|
||||||
pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time,
|
pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time,
|
||||||
pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset
|
pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset
|
||||||
|
@ -334,6 +334,7 @@ static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
|
|||||||
static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
|
static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
|
||||||
static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
|
static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
|
||||||
static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
|
static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
|
||||||
|
static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
|
||||||
static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
|
static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
|
||||||
|
|
||||||
/* ------------------------------------------------------------
|
/* ------------------------------------------------------------
|
||||||
@ -1518,6 +1519,40 @@ pgstat_report_deadlock(void)
|
|||||||
pgstat_send(&msg, sizeof(msg));
|
pgstat_send(&msg, sizeof(msg));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* --------
|
||||||
|
* pgstat_report_checksum_failures_in_db(dboid, failure_count) -
|
||||||
|
*
|
||||||
|
* Tell the collector about one or more checksum failures.
|
||||||
|
* --------
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
|
||||||
|
{
|
||||||
|
PgStat_MsgChecksumFailure msg;
|
||||||
|
|
||||||
|
if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
|
||||||
|
return;
|
||||||
|
|
||||||
|
pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE);
|
||||||
|
msg.m_databaseid = dboid;
|
||||||
|
msg.m_failurecount = failurecount;
|
||||||
|
pgstat_send(&msg, sizeof(msg));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --------
|
||||||
|
* pgstat_report_checksum_failure() -
|
||||||
|
*
|
||||||
|
* Tell the collector about a checksum failure.
|
||||||
|
* --------
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
pgstat_report_checksum_failure(void)
|
||||||
|
{
|
||||||
|
pgstat_report_checksum_failures_in_db(MyDatabaseId, 1);
|
||||||
|
}
|
||||||
|
|
||||||
/* --------
|
/* --------
|
||||||
* pgstat_report_tempfile() -
|
* pgstat_report_tempfile() -
|
||||||
*
|
*
|
||||||
@ -4455,6 +4490,10 @@ PgstatCollectorMain(int argc, char *argv[])
|
|||||||
pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len);
|
pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case PGSTAT_MTYPE_CHECKSUMFAILURE:
|
||||||
|
pgstat_recv_checksum_failure((PgStat_MsgChecksumFailure *) &msg, len);
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -4554,6 +4593,7 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
|
|||||||
dbentry->n_temp_files = 0;
|
dbentry->n_temp_files = 0;
|
||||||
dbentry->n_temp_bytes = 0;
|
dbentry->n_temp_bytes = 0;
|
||||||
dbentry->n_deadlocks = 0;
|
dbentry->n_deadlocks = 0;
|
||||||
|
dbentry->n_checksum_failures = 0;
|
||||||
dbentry->n_block_read_time = 0;
|
dbentry->n_block_read_time = 0;
|
||||||
dbentry->n_block_write_time = 0;
|
dbentry->n_block_write_time = 0;
|
||||||
|
|
||||||
@ -6196,6 +6236,22 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
|
|||||||
dbentry->n_deadlocks++;
|
dbentry->n_deadlocks++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ----------
|
||||||
|
* pgstat_recv_checksum_failure() -
|
||||||
|
*
|
||||||
|
* Process a CHECKSUMFAILURE message.
|
||||||
|
* ----------
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len)
|
||||||
|
{
|
||||||
|
PgStat_StatDBEntry *dbentry;
|
||||||
|
|
||||||
|
dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
|
||||||
|
|
||||||
|
dbentry->n_checksum_failures += msg->m_failurecount;
|
||||||
|
}
|
||||||
|
|
||||||
/* ----------
|
/* ----------
|
||||||
* pgstat_recv_tempfile() -
|
* pgstat_recv_tempfile() -
|
||||||
*
|
*
|
||||||
|
@ -58,7 +58,7 @@ typedef struct
|
|||||||
static int64 sendDir(const char *path, int basepathlen, bool sizeonly,
|
static int64 sendDir(const char *path, int basepathlen, bool sizeonly,
|
||||||
List *tablespaces, bool sendtblspclinks);
|
List *tablespaces, bool sendtblspclinks);
|
||||||
static bool sendFile(const char *readfilename, const char *tarfilename,
|
static bool sendFile(const char *readfilename, const char *tarfilename,
|
||||||
struct stat *statbuf, bool missing_ok);
|
struct stat *statbuf, bool missing_ok, Oid dboid);
|
||||||
static void sendFileWithContent(const char *filename, const char *content);
|
static void sendFileWithContent(const char *filename, const char *content);
|
||||||
static int64 _tarWriteHeader(const char *filename, const char *linktarget,
|
static int64 _tarWriteHeader(const char *filename, const char *linktarget,
|
||||||
struct stat *statbuf, bool sizeonly);
|
struct stat *statbuf, bool sizeonly);
|
||||||
@ -342,7 +342,7 @@ perform_base_backup(basebackup_options *opt)
|
|||||||
(errcode_for_file_access(),
|
(errcode_for_file_access(),
|
||||||
errmsg("could not stat file \"%s\": %m",
|
errmsg("could not stat file \"%s\": %m",
|
||||||
XLOG_CONTROL_FILE)));
|
XLOG_CONTROL_FILE)));
|
||||||
sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false);
|
sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false, InvalidOid);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
sendTablespace(ti->path, false);
|
sendTablespace(ti->path, false);
|
||||||
@ -592,7 +592,7 @@ perform_base_backup(basebackup_options *opt)
|
|||||||
(errcode_for_file_access(),
|
(errcode_for_file_access(),
|
||||||
errmsg("could not stat file \"%s\": %m", pathbuf)));
|
errmsg("could not stat file \"%s\": %m", pathbuf)));
|
||||||
|
|
||||||
sendFile(pathbuf, pathbuf, &statbuf, false);
|
sendFile(pathbuf, pathbuf, &statbuf, false, InvalidOid);
|
||||||
|
|
||||||
/* unconditionally mark file as archived */
|
/* unconditionally mark file as archived */
|
||||||
StatusFilePath(pathbuf, fname, ".done");
|
StatusFilePath(pathbuf, fname, ".done");
|
||||||
@ -1302,7 +1302,7 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
|
|||||||
|
|
||||||
if (!sizeonly)
|
if (!sizeonly)
|
||||||
sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf,
|
sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf,
|
||||||
true);
|
true, isDbDir ? pg_atoi(lastDir + 1, sizeof(Oid), 0) : InvalidOid);
|
||||||
|
|
||||||
if (sent || sizeonly)
|
if (sent || sizeonly)
|
||||||
{
|
{
|
||||||
@ -1358,12 +1358,15 @@ is_checksummed_file(const char *fullpath, const char *filename)
|
|||||||
*
|
*
|
||||||
* If 'missing_ok' is true, will not throw an error if the file is not found.
|
* If 'missing_ok' is true, will not throw an error if the file is not found.
|
||||||
*
|
*
|
||||||
|
* If dboid is anything other than InvalidOid then any checksum failures detected
|
||||||
|
* will get reported to the stats collector.
|
||||||
|
*
|
||||||
* Returns true if the file was successfully sent, false if 'missing_ok',
|
* Returns true if the file was successfully sent, false if 'missing_ok',
|
||||||
* and the file did not exist.
|
* and the file did not exist.
|
||||||
*/
|
*/
|
||||||
static bool
|
static bool
|
||||||
sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf,
|
sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf,
|
||||||
bool missing_ok)
|
bool missing_ok, Oid dboid)
|
||||||
{
|
{
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
BlockNumber blkno = 0;
|
BlockNumber blkno = 0;
|
||||||
@ -1580,6 +1583,9 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
|
|||||||
ereport(WARNING,
|
ereport(WARNING,
|
||||||
(errmsg("file \"%s\" has a total of %d checksum verification "
|
(errmsg("file \"%s\" has a total of %d checksum verification "
|
||||||
"failures", readfilename, checksum_failures)));
|
"failures", readfilename, checksum_failures)));
|
||||||
|
|
||||||
|
if (dboid != InvalidOid)
|
||||||
|
pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
|
||||||
}
|
}
|
||||||
total_checksum_failures += checksum_failures;
|
total_checksum_failures += checksum_failures;
|
||||||
|
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
#include "access/htup_details.h"
|
#include "access/htup_details.h"
|
||||||
#include "access/itup.h"
|
#include "access/itup.h"
|
||||||
#include "access/xlog.h"
|
#include "access/xlog.h"
|
||||||
|
#include "pgstat.h"
|
||||||
#include "storage/checksum.h"
|
#include "storage/checksum.h"
|
||||||
#include "utils/memdebug.h"
|
#include "utils/memdebug.h"
|
||||||
#include "utils/memutils.h"
|
#include "utils/memutils.h"
|
||||||
@ -151,6 +152,8 @@ PageIsVerified(Page page, BlockNumber blkno)
|
|||||||
errmsg("page verification failed, calculated checksum %u but expected %u",
|
errmsg("page verification failed, calculated checksum %u but expected %u",
|
||||||
checksum, p->pd_checksum)));
|
checksum, p->pd_checksum)));
|
||||||
|
|
||||||
|
pgstat_report_checksum_failure();
|
||||||
|
|
||||||
if (header_sane && ignore_checksum_failure)
|
if (header_sane && ignore_checksum_failure)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -1497,6 +1497,21 @@ pg_stat_get_db_deadlocks(PG_FUNCTION_ARGS)
|
|||||||
PG_RETURN_INT64(result);
|
PG_RETURN_INT64(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Datum
|
||||||
|
pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
Oid dbid = PG_GETARG_OID(0);
|
||||||
|
int64 result;
|
||||||
|
PgStat_StatDBEntry *dbentry;
|
||||||
|
|
||||||
|
if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
|
||||||
|
result = 0;
|
||||||
|
else
|
||||||
|
result = (int64) (dbentry->n_checksum_failures);
|
||||||
|
|
||||||
|
PG_RETURN_INT64(result);
|
||||||
|
}
|
||||||
|
|
||||||
Datum
|
Datum
|
||||||
pg_stat_get_db_blk_read_time(PG_FUNCTION_ARGS)
|
pg_stat_get_db_blk_read_time(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
|
@ -53,6 +53,6 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/* yyyymmddN */
|
/* yyyymmddN */
|
||||||
#define CATALOG_VERSION_NO 201903063
|
#define CATALOG_VERSION_NO 201903091
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -5227,6 +5227,10 @@
|
|||||||
proname => 'pg_stat_get_db_deadlocks', provolatile => 's', proparallel => 'r',
|
proname => 'pg_stat_get_db_deadlocks', provolatile => 's', proparallel => 'r',
|
||||||
prorettype => 'int8', proargtypes => 'oid',
|
prorettype => 'int8', proargtypes => 'oid',
|
||||||
prosrc => 'pg_stat_get_db_deadlocks' },
|
prosrc => 'pg_stat_get_db_deadlocks' },
|
||||||
|
{ oid => '3426', descr => 'statistics: checksum failures detected in database',
|
||||||
|
proname => 'pg_stat_get_db_checksum_failures', provolatile => 's', proparallel => 'r',
|
||||||
|
prorettype => 'int8', proargtypes => 'oid',
|
||||||
|
prosrc => 'pg_stat_get_db_checksum_failures' },
|
||||||
{ oid => '3074', descr => 'statistics: last reset for a database',
|
{ oid => '3074', descr => 'statistics: last reset for a database',
|
||||||
proname => 'pg_stat_get_db_stat_reset_time', provolatile => 's',
|
proname => 'pg_stat_get_db_stat_reset_time', provolatile => 's',
|
||||||
proparallel => 'r', prorettype => 'timestamptz', proargtypes => 'oid',
|
proparallel => 'r', prorettype => 'timestamptz', proargtypes => 'oid',
|
||||||
|
@ -64,7 +64,8 @@ typedef enum StatMsgType
|
|||||||
PGSTAT_MTYPE_FUNCPURGE,
|
PGSTAT_MTYPE_FUNCPURGE,
|
||||||
PGSTAT_MTYPE_RECOVERYCONFLICT,
|
PGSTAT_MTYPE_RECOVERYCONFLICT,
|
||||||
PGSTAT_MTYPE_TEMPFILE,
|
PGSTAT_MTYPE_TEMPFILE,
|
||||||
PGSTAT_MTYPE_DEADLOCK
|
PGSTAT_MTYPE_DEADLOCK,
|
||||||
|
PGSTAT_MTYPE_CHECKSUMFAILURE
|
||||||
} StatMsgType;
|
} StatMsgType;
|
||||||
|
|
||||||
/* ----------
|
/* ----------
|
||||||
@ -530,6 +531,18 @@ typedef struct PgStat_MsgDeadlock
|
|||||||
Oid m_databaseid;
|
Oid m_databaseid;
|
||||||
} PgStat_MsgDeadlock;
|
} PgStat_MsgDeadlock;
|
||||||
|
|
||||||
|
/* ----------
|
||||||
|
* PgStat_MsgChecksumFailure Sent by the backend to tell the collector
|
||||||
|
* about checksum failures noticed.
|
||||||
|
* ----------
|
||||||
|
*/
|
||||||
|
typedef struct PgStat_MsgChecksumFailure
|
||||||
|
{
|
||||||
|
PgStat_MsgHdr m_hdr; /* message header; filled in by pgstat_setheader() with PGSTAT_MTYPE_CHECKSUMFAILURE */
|
||||||
|
Oid m_databaseid; /* OID of the database the failures are attributed to */
|
||||||
|
int m_failurecount; /* number of failures; the collector adds this to n_checksum_failures */
|
||||||
|
} PgStat_MsgChecksumFailure;
|
||||||
|
|
||||||
|
|
||||||
/* ----------
|
/* ----------
|
||||||
* PgStat_Msg Union over all possible messages.
|
* PgStat_Msg Union over all possible messages.
|
||||||
@ -593,6 +606,7 @@ typedef struct PgStat_StatDBEntry
|
|||||||
PgStat_Counter n_temp_files;
|
PgStat_Counter n_temp_files;
|
||||||
PgStat_Counter n_temp_bytes;
|
PgStat_Counter n_temp_bytes;
|
||||||
PgStat_Counter n_deadlocks;
|
PgStat_Counter n_deadlocks;
|
||||||
|
PgStat_Counter n_checksum_failures;
|
||||||
PgStat_Counter n_block_read_time; /* times in microseconds */
|
PgStat_Counter n_block_read_time; /* times in microseconds */
|
||||||
PgStat_Counter n_block_write_time;
|
PgStat_Counter n_block_write_time;
|
||||||
|
|
||||||
@ -1200,6 +1214,8 @@ extern void pgstat_report_analyze(Relation rel,
|
|||||||
|
|
||||||
extern void pgstat_report_recovery_conflict(int reason);
|
extern void pgstat_report_recovery_conflict(int reason);
|
||||||
extern void pgstat_report_deadlock(void);
|
extern void pgstat_report_deadlock(void);
|
||||||
|
extern void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount);
|
||||||
|
extern void pgstat_report_checksum_failure(void);
|
||||||
|
|
||||||
extern void pgstat_initialize(void);
|
extern void pgstat_initialize(void);
|
||||||
extern void pgstat_bestart(void);
|
extern void pgstat_bestart(void);
|
||||||
|
@ -1817,6 +1817,7 @@ pg_stat_database| SELECT d.oid AS datid,
|
|||||||
pg_stat_get_db_temp_files(d.oid) AS temp_files,
|
pg_stat_get_db_temp_files(d.oid) AS temp_files,
|
||||||
pg_stat_get_db_temp_bytes(d.oid) AS temp_bytes,
|
pg_stat_get_db_temp_bytes(d.oid) AS temp_bytes,
|
||||||
pg_stat_get_db_deadlocks(d.oid) AS deadlocks,
|
pg_stat_get_db_deadlocks(d.oid) AS deadlocks,
|
||||||
|
pg_stat_get_db_checksum_failures(d.oid) AS checksum_failures,
|
||||||
pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time,
|
pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time,
|
||||||
pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time,
|
pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time,
|
||||||
pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset
|
pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset
|
||||||
|
Loading…
x
Reference in New Issue
Block a user