From 6b9e875f7286d8535bff7955e5aa3602e188e436 Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Sat, 9 Mar 2019 10:45:17 -0800 Subject: [PATCH] Track block level checksum failures in pg_stat_database This adds a column that counts how many checksum failures have occurred on files belonging to a specific database. Both checksum failures during normal backend processing and those created when a base backup detects a checksum failure are counted. Author: Magnus Hagander Reviewed by: Julien Rouhaud --- doc/src/sgml/monitoring.sgml | 5 +++ src/backend/catalog/system_views.sql | 1 + src/backend/postmaster/pgstat.c | 56 ++++++++++++++++++++++++++++ src/backend/replication/basebackup.c | 16 +++++--- src/backend/storage/page/bufpage.c | 3 ++ src/backend/utils/adt/pgstatfuncs.c | 15 ++++++++ src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.dat | 4 ++ src/include/pgstat.h | 18 ++++++++- src/test/regress/expected/rules.out | 1 + 10 files changed, 114 insertions(+), 7 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 0e73cdcddab..e2630fd3682 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -2508,6 +2508,11 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i bigint Number of deadlocks detected in this database + + checksum_failures + bigint + Number of data page checksum failures detected in this database + blk_read_time double precision diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 3e229c693c4..7723f013274 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -823,6 +823,7 @@ CREATE VIEW pg_stat_database AS pg_stat_get_db_temp_files(D.oid) AS temp_files, pg_stat_get_db_temp_bytes(D.oid) AS temp_bytes, pg_stat_get_db_deadlocks(D.oid) AS deadlocks, + pg_stat_get_db_checksum_failures(D.oid) AS checksum_failures, pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time, pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time, pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 81c64992518..43ec33834bf 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -334,6 +334,7 @@ static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len); static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len); static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len); static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len); +static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len); static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len); /* ------------------------------------------------------------ @@ -1518,6 +1519,40 @@ pgstat_report_deadlock(void) pgstat_send(&msg, sizeof(msg)); } + + +/* -------- + * pgstat_report_checksum_failures_in_db(dboid, failure_count) - + * + * Tell the collector about one or more checksum failures. + * -------- + */ +void +pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount) +{ + PgStat_MsgChecksumFailure msg; + + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE); + msg.m_databaseid = dboid; + msg.m_failurecount = failurecount; + pgstat_send(&msg, sizeof(msg)); +} + +/* -------- + * pgstat_report_checksum_failure() - + * + * Tell the collector about a checksum failure. + * -------- + */ +void +pgstat_report_checksum_failure(void) +{ + pgstat_report_checksum_failures_in_db(MyDatabaseId, 1); +} + /* -------- * pgstat_report_tempfile() - * @@ -4455,6 +4490,10 @@ PgstatCollectorMain(int argc, char *argv[]) pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len); break; + case PGSTAT_MTYPE_CHECKSUMFAILURE: + pgstat_recv_checksum_failure((PgStat_MsgChecksumFailure *) &msg, len); + break; + default: break; } @@ -4554,6 +4593,7 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry) dbentry->n_temp_files = 0; dbentry->n_temp_bytes = 0; dbentry->n_deadlocks = 0; + dbentry->n_checksum_failures = 0; dbentry->n_block_read_time = 0; dbentry->n_block_write_time = 0; @@ -6196,6 +6236,22 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len) dbentry->n_deadlocks++; } +/* ---------- + * pgstat_recv_checksum_failure() - + * + * Process a CHECKSUMFAILURE message. + * ---------- + */ +static void +pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + dbentry->n_checksum_failures += msg->m_failurecount; +} + /* ---------- * pgstat_recv_tempfile() - * diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index def6c03dd0c..6c324a6661d 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -58,7 +58,7 @@ typedef struct static int64 sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces, bool sendtblspclinks); static bool sendFile(const char *readfilename, const char *tarfilename, - struct stat *statbuf, bool missing_ok); + struct stat *statbuf, bool missing_ok, Oid dboid); static void sendFileWithContent(const char *filename, const char *content); static int64 _tarWriteHeader(const char *filename, const char *linktarget, struct stat *statbuf, bool sizeonly); @@ -342,7 +342,7 @@ perform_base_backup(basebackup_options *opt) (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", XLOG_CONTROL_FILE))); - sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false); + sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false, InvalidOid); } else sendTablespace(ti->path, false); @@ -592,7 +592,7 @@ perform_base_backup(basebackup_options *opt) (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", pathbuf))); - sendFile(pathbuf, pathbuf, &statbuf, false); + sendFile(pathbuf, pathbuf, &statbuf, false, InvalidOid); /* unconditionally mark file as archived */ StatusFilePath(pathbuf, fname, ".done"); @@ -1302,7 +1302,7 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces, if (!sizeonly) sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf, - true); + true, isDbDir ? pg_atoi(lastDir + 1, sizeof(Oid), 0) : InvalidOid); if (sent || sizeonly) { @@ -1358,12 +1358,15 @@ is_checksummed_file(const char *fullpath, const char *filename) * * If 'missing_ok' is true, will not throw an error if the file is not found. * + * If dboid is anything other than InvalidOid then any checksum failures detected + * will get reported to the stats collector. + * * Returns true if the file was successfully sent, false if 'missing_ok', * and the file did not exist. */ static bool sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf, - bool missing_ok) + bool missing_ok, Oid dboid) { FILE *fp; BlockNumber blkno = 0; @@ -1580,6 +1583,9 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf ereport(WARNING, (errmsg("file \"%s\" has a total of %d checksum verification " "failures", readfilename, checksum_failures))); + + if (dboid != InvalidOid) + pgstat_report_checksum_failures_in_db(dboid, checksum_failures); } total_checksum_failures += checksum_failures; diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 517220bc334..14bc61b8ad2 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -17,6 +17,7 @@ #include "access/htup_details.h" #include "access/itup.h" #include "access/xlog.h" +#include "pgstat.h" #include "storage/checksum.h" #include "utils/memdebug.h" #include "utils/memutils.h" @@ -151,6 +152,8 @@ PageIsVerified(Page page, BlockNumber blkno) errmsg("page verification failed, calculated checksum %u but expected %u", checksum, p->pd_checksum))); + pgstat_report_checksum_failure(); + if (header_sane && ignore_checksum_failure) return true; } diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 69f72657792..da1d685c081 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1497,6 +1497,21 @@ pg_stat_get_db_deadlocks(PG_FUNCTION_ARGS) PG_RETURN_INT64(result); } +Datum +pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS) +{ + Oid dbid = PG_GETARG_OID(0); + int64 result; + PgStat_StatDBEntry *dbentry; + + if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) + result = 0; + else + result = (int64) (dbentry->n_checksum_failures); + + PG_RETURN_INT64(result); +} + Datum pg_stat_get_db_blk_read_time(PG_FUNCTION_ARGS) { diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index ad8bde6ae3a..baf34a3b6fe 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201903063 +#define CATALOG_VERSION_NO 201903091 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index e9638660ff8..562c5408f84 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5227,6 +5227,10 @@ proname => 'pg_stat_get_db_deadlocks', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => 'oid', prosrc => 'pg_stat_get_db_deadlocks' }, +{ oid => '3426', descr => 'statistics: checksum failures detected in database', + proname => 'pg_stat_get_db_checksum_failures', provolatile => 's', proparallel => 'r', + prorettype => 'int8', proargtypes => 'oid', + prosrc => 'pg_stat_get_db_checksum_failures' }, { oid => '3074', descr => 'statistics: last reset for a database', proname => 'pg_stat_get_db_stat_reset_time', provolatile => 's', proparallel => 'r', prorettype => 'timestamptz', proargtypes => 'oid', diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 88a75fb798e..725c8b0d64a 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -64,7 +64,8 @@ typedef enum StatMsgType PGSTAT_MTYPE_FUNCPURGE, PGSTAT_MTYPE_RECOVERYCONFLICT, PGSTAT_MTYPE_TEMPFILE, - PGSTAT_MTYPE_DEADLOCK + PGSTAT_MTYPE_DEADLOCK, + PGSTAT_MTYPE_CHECKSUMFAILURE } StatMsgType; /* ---------- @@ -530,6 +531,18 @@ typedef struct PgStat_MsgDeadlock Oid m_databaseid; } PgStat_MsgDeadlock; +/* ---------- + * PgStat_MsgChecksumFailure Sent by the backend to tell the collector + * about checksum failures noticed. + * ---------- + */ +typedef struct PgStat_MsgChecksumFailure +{ + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + int m_failurecount; +} PgStat_MsgChecksumFailure; + /* ---------- * PgStat_Msg Union over all possible messages. @@ -593,6 +606,7 @@ typedef struct PgStat_StatDBEntry PgStat_Counter n_temp_files; PgStat_Counter n_temp_bytes; PgStat_Counter n_deadlocks; + PgStat_Counter n_checksum_failures; PgStat_Counter n_block_read_time; /* times in microseconds */ PgStat_Counter n_block_write_time; @@ -1200,6 +1214,8 @@ extern void pgstat_report_analyze(Relation rel, extern void pgstat_report_recovery_conflict(int reason); extern void pgstat_report_deadlock(void); +extern void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount); +extern void pgstat_report_checksum_failure(void); extern void pgstat_initialize(void); extern void pgstat_bestart(void); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 98f417cb57c..e0f2c543efa 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1817,6 +1817,7 @@ pg_stat_database| SELECT d.oid AS datid, pg_stat_get_db_temp_files(d.oid) AS temp_files, pg_stat_get_db_temp_bytes(d.oid) AS temp_bytes, pg_stat_get_db_deadlocks(d.oid) AS deadlocks, + pg_stat_get_db_checksum_failures(d.oid) AS checksum_failures, pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time, pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time, pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset