diff --git a/doc/src/sgml/high-availability.sgml b/doc/src/sgml/high-availability.sgml index bb5d9962ed3..b5d32bb720a 100644 --- a/doc/src/sgml/high-availability.sgml +++ b/doc/src/sgml/high-availability.sgml @@ -2148,18 +2148,14 @@ LOG: database system is ready to accept read only connections - The setting of some parameters on the standby will need reconfiguration - if they have been changed on the primary. For these parameters, - the value on the standby must - be equal to or greater than the value on the primary. - Therefore, if you want to increase these values, you should do so on all - standby servers first, before applying the changes to the primary server. - Conversely, if you want to decrease these values, you should do so on the - primary server first, before applying the changes to all standby servers. - If these parameters - are not set high enough then the standby will refuse to start. - Higher values can then be supplied and the server - restarted to begin recovery again. These parameters are: + The settings of some parameters determine the size of shared memory for + tracking transaction IDs, locks, and prepared transactions. These shared + memory structures should be no smaller on a standby than on the primary. + Otherwise, it could happen that the standby runs out of shared memory + during recovery. For example, if the primary uses a prepared transaction + but the standby did not allocate any shared memory for tracking prepared + transactions, then recovery will abort and cannot continue until the + standby's configuration is changed. The parameters affected are: @@ -2188,6 +2184,34 @@ LOG: database system is ready to accept read only connections + + The easiest way to ensure this does not become a problem is to have these + parameters set on the standbys to values equal to or greater than on the + primary. 
Therefore, if you want to increase these values, you should do + so on all standby servers first, before applying the changes to the + primary server. Conversely, if you want to decrease these values, you + should do so on the primary server first, before applying the changes to + all standby servers. The WAL tracks changes to these parameters on the + primary, and if a standby processes WAL that indicates that the current + value on the primary is higher than its own value, it will log a warning, for example: + +WARNING: insufficient setting for parameter max_connections +DETAIL: max_connections = 80 is a lower setting than on the master server (where its value was 100). +HINT: Change parameters and restart the server, or there may be resource exhaustion errors sooner or later. + + Recovery will continue but could abort at any time thereafter. (It could + also never end up failing if the activity on the primary does not actually + require the full extent of the allocated shared memory resources.) If + recovery reaches a point where it cannot continue due to lack of shared + memory, recovery will pause and another warning will be logged, for example: + +WARNING: recovery paused because of insufficient parameter settings +DETAIL: See earlier in the log about which settings are insufficient. +HINT: Recovery cannot continue unless the configuration is changed and the server restarted. + + This warning will be repeated once a minute. At that point, the settings on + the standby need to be updated and the instance restarted before recovery + can continue. 
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 5adf956f413..ce35f15f347 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -2360,11 +2360,14 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, /* Get a free gxact from the freelist */ if (TwoPhaseState->freeGXacts == NULL) + { + StandbyParamErrorPauseRecovery(); ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("maximum number of prepared transactions reached"), errhint("Increase max_prepared_transactions (currently %d).", max_prepared_xacts))); + } gxact = TwoPhaseState->freeGXacts; TwoPhaseState->freeGXacts = gxact->next; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8fe92962b0d..1951103b262 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -264,6 +264,8 @@ bool InArchiveRecovery = false; static bool standby_signal_file_found = false; static bool recovery_signal_file_found = false; +static bool need_restart_for_parameter_values = false; + /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; @@ -5998,6 +6000,54 @@ SetRecoveryPause(bool recoveryPause) SpinLockRelease(&XLogCtl->info_lck); } +/* + * If in hot standby, pause recovery because of a parameter conflict. + * + * Similar to recoveryPausesHere() but with different messaging. The user + * is expected to make the parameter change and restart the server. If they + * just unpause recovery, they will then run into whatever error is after this + * function call for the non-hot-standby case. + * + * We intentionally do not give advice about specific parameters or values + * here because it might be misleading. 
For example, if we run out of lock + * space, then in the single-server case we would recommend raising + * max_locks_per_transaction, but in recovery it could equally be the case + * that max_connections is out of sync with the primary. If we get here, we + * have already logged any parameter discrepancies in + * RecoveryRequiresIntParameter(), so users can go back to that and get + * concrete and accurate information. + */ +void +StandbyParamErrorPauseRecovery(void) +{ + TimestampTz last_warning = 0; + + if (!AmStartupProcess() || !need_restart_for_parameter_values) + return; + + SetRecoveryPause(true); + + do + { + TimestampTz now = GetCurrentTimestamp(); + + if (TimestampDifferenceExceeds(last_warning, now, 60000)) + { + ereport(WARNING, + (errmsg("recovery paused because of insufficient parameter settings"), + errdetail("See earlier in the log about which settings are insufficient."), + errhint("Recovery cannot continue unless the configuration is changed and the server restarted."))); + last_warning = now; + } + + pgstat_report_wait_start(WAIT_EVENT_RECOVERY_PAUSE); + pg_usleep(1000000L); /* 1000 ms */ + pgstat_report_wait_end(); + HandleStartupProcInterrupts(); + } + while (RecoveryIsPaused()); +} + /* * When recovery_min_apply_delay is set, we wait long enough to make sure * certain record types are applied at least that interval behind the master. 
@@ -6177,16 +6227,20 @@ GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream) * Note that text field supplied is a parameter name and does not require * translation */ -#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \ -do { \ - if ((currValue) < (minValue)) \ - ereport(ERROR, \ - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \ - errmsg("hot standby is not possible because %s = %d is a lower setting than on the master server (its value was %d)", \ - param_name, \ - currValue, \ - minValue))); \ -} while(0) +static void +RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue) +{ + if (currValue < minValue) + { + ereport(WARNING, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("insufficient setting for parameter %s", param_name), + errdetail("%s = %d is a lower setting than on the master server (where its value was %d).", + param_name, currValue, minValue), + errhint("Change parameters and restart the server, or there may be resource exhaustion errors sooner or later."))); + need_restart_for_parameter_values = true; + } +} /* * Check to see if required parameters are set high enough on this server diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index f45a619deb8..cfb88db4a4d 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -3654,7 +3654,14 @@ KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, * If it still won't fit then we're out of memory */ if (head + nxids > pArray->maxKnownAssignedXids) - elog(ERROR, "too many KnownAssignedXids"); + { + StandbyParamErrorPauseRecovery(); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errdetail("There are no more KnownAssignedXids slots."), + errhint("You might need to increase max_connections."))); + } } /* Now we can insert the xids into the space starting at head */ diff --git a/src/backend/storage/lmgr/lock.c 
b/src/backend/storage/lmgr/lock.c index 3013ef63d05..c8def1674f4 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -965,10 +965,13 @@ LockAcquireExtended(const LOCKTAG *locktag, if (locallockp) *locallockp = NULL; if (reportMemoryError) + { + StandbyParamErrorPauseRecovery(); ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of shared memory"), errhint("You might need to increase max_locks_per_transaction."))); + } else return LOCKACQUIRE_NOT_AVAIL; } @@ -1003,10 +1006,13 @@ LockAcquireExtended(const LOCKTAG *locktag, if (locallockp) *locallockp = NULL; if (reportMemoryError) + { + StandbyParamErrorPauseRecovery(); ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of shared memory"), errhint("You might need to increase max_locks_per_transaction."))); + } else return LOCKACQUIRE_NOT_AVAIL; } @@ -2828,6 +2834,7 @@ FastPathGetRelationLockEntry(LOCALLOCK *locallock) { LWLockRelease(partitionLock); LWLockRelease(&MyProc->backendLock); + StandbyParamErrorPauseRecovery(); ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of shared memory"), @@ -4158,6 +4165,7 @@ lock_twophase_recover(TransactionId xid, uint16 info, if (!lock) { LWLockRelease(partitionLock); + StandbyParamErrorPauseRecovery(); ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of shared memory"), @@ -4223,6 +4231,7 @@ lock_twophase_recover(TransactionId xid, uint16 info, elog(PANIC, "lock table corrupted"); } LWLockRelease(partitionLock); + StandbyParamErrorPauseRecovery(); ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of shared memory"), @@ -4515,6 +4524,7 @@ VirtualXactLock(VirtualTransactionId vxid, bool wait) { LWLockRelease(partitionLock); LWLockRelease(&proc->backendLock); + StandbyParamErrorPauseRecovery(); ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of shared memory"), diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 2b1b67d35c1..9ec7b31cce1 100644 --- 
a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -287,6 +287,7 @@ extern XLogRecPtr GetXLogInsertRecPtr(void); extern XLogRecPtr GetXLogWriteRecPtr(void); extern bool RecoveryIsPaused(void); extern void SetRecoveryPause(bool recoveryPause); +extern void StandbyParamErrorPauseRecovery(void); extern TimestampTz GetLatestXTime(void); extern TimestampTz GetCurrentChunkReplayStartTime(void);