diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 6d8f52a590e..bc624c27a92 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -2685,30 +2685,65 @@ pgstat_initialize(void) * * Initialize this backend's entry in the PgBackendStatus array. * Called from InitPostgres. - * MyDatabaseId, session userid, and application_name must be set - * (hence, this cannot be combined with pgstat_initialize). + * Apart from auxiliary processes, MyBackendId, MyDatabaseId, + * session userid, and application_name must be set for a + * backend (hence, this cannot be combined with pgstat_initialize). + * Note also that we must be inside a transaction if this isn't an aux + * process, as we may need to do encoding conversion on some strings. * ---------- */ void pgstat_bestart(void) { - TimestampTz proc_start_timestamp; - Oid userid; - SockAddr clientaddr; - volatile PgBackendStatus *beentry; + volatile PgBackendStatus *vbeentry = MyBEEntry; + PgBackendStatus lbeentry; +#ifdef USE_SSL + PgBackendSSLStatus lsslstatus; +#endif + + /* pgstats state must be initialized from pgstat_initialize() */ + Assert(vbeentry != NULL); /* - * To minimize the time spent modifying the PgBackendStatus entry, fetch - * all the needed data first. + * To minimize the time spent modifying the PgBackendStatus entry, and + * avoid risk of errors inside the critical section, we first copy the + * shared-memory struct to a local variable, then modify the data in the + * local variable, then copy the local variable back to shared memory. + * Only the last step has to be inside the critical section. * + * Most of the data we copy from shared memory is just going to be + * overwritten, but the struct's not so large that it's worth the + * maintenance hassle to copy only the needful fields. + */ + memcpy(&lbeentry, + (char *) vbeentry, + sizeof(PgBackendStatus)); + + /* This struct can just start from zeroes each time, though */ +#ifdef USE_SSL + memset(&lsslstatus, 0, sizeof(lsslstatus)); +#endif + + /* + * Now fill in all the fields of lbeentry, except for strings that are + * out-of-line data. Those have to be handled separately, below. + */ + lbeentry.st_procpid = MyProcPid; + + /* * If we have a MyProcPort, use its session start time (for consistency, * and to save a kernel call). */ if (MyProcPort) - proc_start_timestamp = MyProcPort->SessionStartTime; + lbeentry.st_proc_start_timestamp = MyProcPort->SessionStartTime; else - proc_start_timestamp = GetCurrentTimestamp(); - userid = GetSessionUserId(); + lbeentry.st_proc_start_timestamp = GetCurrentTimestamp(); + + lbeentry.st_activity_start_timestamp = 0; + lbeentry.st_state_start_timestamp = 0; + lbeentry.st_xact_start_timestamp = 0; + lbeentry.st_databaseid = MyDatabaseId; + lbeentry.st_userid = GetSessionUserId(); /* * We may not have a MyProcPort (eg, if this is the autovacuum process). @@ -2716,61 +2751,32 @@ pgstat_bestart(void) * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port. */ if (MyProcPort) - memcpy(&clientaddr, &MyProcPort->raddr, sizeof(clientaddr)); + memcpy(&lbeentry.st_clientaddr, &MyProcPort->raddr, + sizeof(lbeentry.st_clientaddr)); else - MemSet(&clientaddr, 0, sizeof(clientaddr)); + MemSet(&lbeentry.st_clientaddr, 0, sizeof(lbeentry.st_clientaddr)); - /* - * Initialize my status entry, following the protocol of bumping - * st_changecount before and after; and make sure it's even afterwards. We - * use a volatile pointer here to ensure the compiler doesn't try to get - * cute. - */ - beentry = MyBEEntry; - do - { - pgstat_increment_changecount_before(beentry); - } while ((beentry->st_changecount & 1) == 0); - - beentry->st_procpid = MyProcPid; - beentry->st_proc_start_timestamp = proc_start_timestamp; - beentry->st_activity_start_timestamp = 0; - beentry->st_state_start_timestamp = 0; - beentry->st_xact_start_timestamp = 0; - beentry->st_databaseid = MyDatabaseId; - beentry->st_userid = userid; - beentry->st_clientaddr = clientaddr; - if (MyProcPort && MyProcPort->remote_hostname) - strlcpy(beentry->st_clienthostname, MyProcPort->remote_hostname, - NAMEDATALEN); - else - beentry->st_clienthostname[0] = '\0'; #ifdef USE_SSL if (MyProcPort && MyProcPort->ssl != NULL) { - beentry->st_ssl = true; - beentry->st_sslstatus->ssl_bits = be_tls_get_cipher_bits(MyProcPort); - beentry->st_sslstatus->ssl_compression = be_tls_get_compression(MyProcPort); - be_tls_get_version(MyProcPort, beentry->st_sslstatus->ssl_version, NAMEDATALEN); - be_tls_get_cipher(MyProcPort, beentry->st_sslstatus->ssl_cipher, NAMEDATALEN); - be_tls_get_peerdn_name(MyProcPort, beentry->st_sslstatus->ssl_clientdn, NAMEDATALEN); + lbeentry.st_ssl = true; + lsslstatus.ssl_bits = be_tls_get_cipher_bits(MyProcPort); + lsslstatus.ssl_compression = be_tls_get_compression(MyProcPort); + be_tls_get_version(MyProcPort, lsslstatus.ssl_version, NAMEDATALEN); + be_tls_get_cipher(MyProcPort, lsslstatus.ssl_cipher, NAMEDATALEN); + be_tls_get_peerdn_name(MyProcPort, lsslstatus.ssl_clientdn, NAMEDATALEN); } else { - beentry->st_ssl = false; + lbeentry.st_ssl = false; } #else - beentry->st_ssl = false; + lbeentry.st_ssl = false; #endif - beentry->st_state = STATE_UNDEFINED; - beentry->st_appname[0] = '\0'; - beentry->st_activity[0] = '\0'; - /* Also make sure the last byte in each string area is always 0 */ - beentry->st_clienthostname[NAMEDATALEN - 1] = '\0'; - beentry->st_appname[NAMEDATALEN - 1] = '\0'; - beentry->st_activity[pgstat_track_activity_query_size - 1] = '\0'; - beentry->st_progress_command = PROGRESS_COMMAND_INVALID; - beentry->st_progress_command_target = InvalidOid; + + lbeentry.st_state = STATE_UNDEFINED; + lbeentry.st_progress_command = PROGRESS_COMMAND_INVALID; + lbeentry.st_progress_command_target = InvalidOid; /* * we don't zero st_progress_param here to save cycles; nobody should @@ -2778,7 +2784,42 @@ pgstat_bestart(void) * than PROGRESS_COMMAND_INVALID */ - pgstat_increment_changecount_after(beentry); + /* + * We're ready to enter the critical section that fills the shared-memory + * status entry. We follow the protocol of bumping st_changecount before + * and after; and make sure it's even afterwards. We use a volatile + * pointer here to ensure the compiler doesn't try to get cute. + */ + PGSTAT_BEGIN_WRITE_ACTIVITY(vbeentry); + + /* make sure we'll memcpy the same st_changecount back */ + lbeentry.st_changecount = vbeentry->st_changecount; + + memcpy((char *) vbeentry, + &lbeentry, + sizeof(PgBackendStatus)); + + /* + * We can write the out-of-line strings and structs using the pointers + * that are in lbeentry; this saves some de-volatilizing messiness. + */ + lbeentry.st_appname[0] = '\0'; + if (MyProcPort && MyProcPort->remote_hostname) + strlcpy(lbeentry.st_clienthostname, MyProcPort->remote_hostname, + NAMEDATALEN); + else + lbeentry.st_clienthostname[0] = '\0'; + lbeentry.st_activity[0] = '\0'; + /* Also make sure the last byte in each string area is always 0 */ + lbeentry.st_appname[NAMEDATALEN - 1] = '\0'; + lbeentry.st_clienthostname[NAMEDATALEN - 1] = '\0'; + lbeentry.st_activity[pgstat_track_activity_query_size - 1] = '\0'; + +#ifdef USE_SSL + memcpy(lbeentry.st_sslstatus, &lsslstatus, sizeof(PgBackendSSLStatus)); +#endif + + PGSTAT_END_WRITE_ACTIVITY(vbeentry); /* Update app name to current GUC setting */ if (application_name) @@ -2813,11 +2854,11 @@ pgstat_beshutdown_hook(int code, Datum arg) * before and after. We use a volatile pointer here to ensure the * compiler doesn't try to get cute. */ - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); beentry->st_procpid = 0; /* mark invalid */ - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } @@ -2856,7 +2897,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str) * non-disabled state. As our final update, change the state and * clear fields we will not be updating anymore. */ - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); beentry->st_state = STATE_DISABLED; beentry->st_state_start_timestamp = 0; beentry->st_activity[0] = '\0'; @@ -2864,14 +2905,14 @@ pgstat_report_activity(BackendState state, const char *cmd_str) /* st_xact_start_timestamp and wait_event_info are also disabled */ beentry->st_xact_start_timestamp = 0; proc->wait_event_info = 0; - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } return; } /* - * To minimize the time spent modifying the entry, fetch all the needed - * data first. + * To minimize the time spent modifying the entry, and avoid risk of + * errors inside the critical section, fetch all the needed data first. */ start_timestamp = GetCurrentStatementStartTimestamp(); if (cmd_str != NULL) @@ -2884,7 +2925,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str) /* * Now update the status entry */ - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); beentry->st_state = state; beentry->st_state_start_timestamp = current_timestamp; @@ -2896,7 +2937,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str) beentry->st_activity_start_timestamp = start_timestamp; } - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } /*----------- @@ -2914,11 +2955,11 @@ pgstat_progress_start_command(ProgressCommandType cmdtype, Oid relid) if (!beentry || !pgstat_track_activities) return; - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); beentry->st_progress_command = cmdtype; beentry->st_progress_command_target = relid; MemSet(&beentry->st_progress_param, 0, sizeof(beentry->st_progress_param)); - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } /*----------- @@ -2937,9 +2978,9 @@ pgstat_progress_update_param(int index, int64 val) if (!beentry || !pgstat_track_activities) return; - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); beentry->st_progress_param[index] = val; - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } /*----------- @@ -2959,7 +3000,7 @@ pgstat_progress_update_multi_param(int nparam, const int *index, if (!beentry || !pgstat_track_activities || nparam == 0) return; - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); for (i = 0; i < nparam; ++i) { @@ -2968,7 +3009,7 @@ pgstat_progress_update_multi_param(int nparam, const int *index, beentry->st_progress_param[index[i]] = val[i]; } - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } /*----------- @@ -2989,10 +3030,10 @@ pgstat_progress_end_command(void) && beentry->st_progress_command == PROGRESS_COMMAND_INVALID) return; - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); beentry->st_progress_command = PROGRESS_COMMAND_INVALID; beentry->st_progress_command_target = InvalidOid; - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } /* ---------- @@ -3018,12 +3059,12 @@ pgstat_report_appname(const char *appname) * st_changecount before and after. We use a volatile pointer here to * ensure the compiler doesn't try to get cute. */ - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); memcpy((char *) beentry->st_appname, appname, len); beentry->st_appname[len] = '\0'; - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } /* @@ -3043,9 +3084,11 @@ pgstat_report_xact_timestamp(TimestampTz tstamp) * st_changecount before and after. We use a volatile pointer here to * ensure the compiler doesn't try to get cute. */ - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + beentry->st_xact_start_timestamp = tstamp; - pgstat_increment_changecount_after(beentry); + + PGSTAT_END_WRITE_ACTIVITY(beentry); } /* ---------- @@ -3111,14 +3154,19 @@ pgstat_read_current_status(void) int before_changecount; int after_changecount; - pgstat_save_changecount_before(beentry, before_changecount); + pgstat_begin_read_activity(beentry, before_changecount); localentry->backendStatus.st_procpid = beentry->st_procpid; + /* Skip all the data-copying work if entry is not in use */ if (localentry->backendStatus.st_procpid > 0) { memcpy(&localentry->backendStatus, (char *) beentry, sizeof(PgBackendStatus)); /* + * For each PgBackendStatus field that is a pointer, copy the + * pointed-to data, then adjust the local copy of the pointer + * field to point at the local copy of the data. + * * strcpy is safe even if the string is modified concurrently, * because there's always a \0 at the end of the buffer. */ @@ -3128,7 +3176,6 @@ pgstat_read_current_status(void) localentry->backendStatus.st_clienthostname = localclienthostname; strcpy(localactivity, (char *) beentry->st_activity); localentry->backendStatus.st_activity = localactivity; - localentry->backendStatus.st_ssl = beentry->st_ssl; #ifdef USE_SSL if (beentry->st_ssl) { @@ -3138,9 +3185,10 @@ pgstat_read_current_status(void) #endif } - pgstat_save_changecount_after(beentry, after_changecount); - if (before_changecount == after_changecount && - (before_changecount & 1) == 0) + pgstat_end_read_activity(beentry, after_changecount); + + if (pgstat_read_activity_complete(before_changecount, + after_changecount)) break; /* Make sure we can break out of loop if stuck... */ @@ -3298,14 +3346,14 @@ pgstat_get_backend_current_activity(int pid, bool checkUser) int before_changecount; int after_changecount; - pgstat_save_changecount_before(vbeentry, before_changecount); + pgstat_begin_read_activity(vbeentry, before_changecount); found = (vbeentry->st_procpid == pid); - pgstat_save_changecount_after(vbeentry, after_changecount); + pgstat_end_read_activity(vbeentry, after_changecount); - if (before_changecount == after_changecount && - (before_changecount & 1) == 0) + if (pgstat_read_activity_complete(before_changecount, + after_changecount)) break; /* Make sure we can break out of loop if stuck... */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 410b7a8791c..d689b527197 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -780,11 +780,12 @@ typedef struct PgBackendStatus * the copy is valid; otherwise start over. This makes updates cheap * while reads are potentially expensive, but that's the tradeoff we want. * - * The above protocol needs the memory barriers to ensure that the - * apparent order of execution is as it desires. Otherwise, for example, - * the CPU might rearrange the code so that st_changecount is incremented - * twice before the modification on a machine with weak memory ordering. - * This surprising result can lead to bugs. + * The above protocol needs memory barriers to ensure that the apparent + * order of execution is as it desires. Otherwise, for example, the CPU + * might rearrange the code so that st_changecount is incremented twice + * before the modification on a machine with weak memory ordering. Hence, + * use the macros defined below for manipulating st_changecount, rather + * than touching it directly. */ int st_changecount; @@ -831,41 +832,75 @@ typedef struct PgBackendStatus } PgBackendStatus; /* - * Macros to load and store st_changecount with the memory barriers. + * Macros to load and store st_changecount with appropriate memory barriers. * - * pgstat_increment_changecount_before() and - * pgstat_increment_changecount_after() need to be called before and after - * PgBackendStatus entries are modified, respectively. This makes sure that - * st_changecount is incremented around the modification. + * Use PGSTAT_BEGIN_WRITE_ACTIVITY() before, and PGSTAT_END_WRITE_ACTIVITY() + * after, modifying the current process's PgBackendStatus data. Note that, + * since there is no mechanism for cleaning up st_changecount after an error, + * THESE MACROS FORM A CRITICAL SECTION. Any error between them will be + * promoted to PANIC, causing a database restart to clean up shared memory! + * Hence, keep the critical section as short and straight-line as possible. + * Aside from being safer, that minimizes the window in which readers will + * have to loop. * - * Also pgstat_save_changecount_before() and pgstat_save_changecount_after() - * need to be called before and after PgBackendStatus entries are copied into - * private memory, respectively. + * Reader logic should follow this sketch: + * + * for (;;) + * { + * int before_ct, after_ct; + * + * pgstat_begin_read_activity(beentry, before_ct); + * ... copy beentry data to local memory ... + * pgstat_end_read_activity(beentry, after_ct); + * if (pgstat_read_activity_complete(before_ct, after_ct)) + * break; + * CHECK_FOR_INTERRUPTS(); + * } + * + * For extra safety, we generally use volatile beentry pointers, although + * the memory barriers should theoretically be sufficient. */ -#define pgstat_increment_changecount_before(beentry) \ - do { \ - beentry->st_changecount++; \ +#define PGSTAT_BEGIN_WRITE_ACTIVITY(beentry) \ + do { \ + START_CRIT_SECTION(); \ + (beentry)->st_changecount++; \ pg_write_barrier(); \ } while (0) +#define PGSTAT_END_WRITE_ACTIVITY(beentry) \ + do { \ + pg_write_barrier(); \ + (beentry)->st_changecount++; \ + Assert(((beentry)->st_changecount & 1) == 0); \ + END_CRIT_SECTION(); \ + } while (0) + +#define pgstat_begin_read_activity(beentry, before_changecount) \ + do { \ + (before_changecount) = (beentry)->st_changecount; \ + pg_read_barrier(); \ + } while (0) + +#define pgstat_end_read_activity(beentry, after_changecount) \ + do { \ + pg_read_barrier(); \ + (after_changecount) = (beentry)->st_changecount; \ + } while (0) + +#define pgstat_read_activity_complete(before_changecount, after_changecount) \ + ((before_changecount) == (after_changecount) && \ + ((before_changecount) & 1) == 0) + +/* deprecated names for above macros; these are gone in v12 */ +#define pgstat_increment_changecount_before(beentry) \ + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry) #define pgstat_increment_changecount_after(beentry) \ - do { \ - pg_write_barrier(); \ - beentry->st_changecount++; \ - Assert((beentry->st_changecount & 1) == 0); \ - } while (0) + PGSTAT_END_WRITE_ACTIVITY(beentry) +#define pgstat_save_changecount_before(beentry, save_changecount) \ + pgstat_begin_read_activity(beentry, save_changecount) +#define pgstat_save_changecount_after(beentry, save_changecount) \ + pgstat_end_read_activity(beentry, save_changecount) -#define pgstat_save_changecount_before(beentry, save_changecount) \ - do { \ - save_changecount = beentry->st_changecount; \ - pg_read_barrier(); \ - } while (0) - -#define pgstat_save_changecount_after(beentry, save_changecount) \ - do { \ - pg_read_barrier(); \ - save_changecount = beentry->st_changecount; \ - } while (0) /* ---------- * LocalPgBackendStatus