mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-25 13:17:41 +03:00 
			
		
		
		
	Add GUC to control the time to wait before retrieving WAL after failed attempt.
Previously, when the standby server failed to retrieve WAL files from any source (i.e., streaming replication, the local pg_xlog directory, or the WAL archive), it always waited for a hard-coded five seconds before the next attempt. This is problematic in, for example, a warm-standby setup: restore_command can fail every five seconds even while a new WAL file is expected to be unavailable for a long time, flooding the log files with its error messages. This commit adds a new parameter, wal_retrieve_retry_interval, to control that wait time. Alexey Vasiliev and Michael Paquier, reviewed by Andres Freund and me.
This commit is contained in:
		| @@ -2985,6 +2985,24 @@ include_dir 'conf.d' | |||||||
|       </listitem> |       </listitem> | ||||||
|      </varlistentry> |      </varlistentry> | ||||||
|  |  | ||||||
|  |      <varlistentry id="guc-wal-retrieve-retry-interval" xreflabel="wal_retrieve_retry_interval"> | ||||||
|  |       <term><varname>wal_retrieve_retry_interval</varname> (<type>integer</type>) | ||||||
|  |       <indexterm> | ||||||
|  |        <primary><varname>wal_retrieve_retry_interval</> configuration parameter</primary> | ||||||
|  |       </indexterm> | ||||||
|  |       </term> | ||||||
|  |       <listitem> | ||||||
|  |        <para> | ||||||
|  |         Specifies how long the standby server should wait when WAL data is not | ||||||
|  |         available from any source (streaming replication, | ||||||
|  |         local <filename>pg_xlog</> or WAL archive) before retrying to | ||||||
|  |         retrieve WAL data.  This parameter can only be set in the | ||||||
|  |         <filename>postgresql.conf</> file or on the server command line. | ||||||
|  |         The default value is 5 seconds. Units are milliseconds if not specified. | ||||||
|  |        </para> | ||||||
|  |       </listitem> | ||||||
|  |      </varlistentry> | ||||||
|  |  | ||||||
|      </variablelist> |      </variablelist> | ||||||
|     </sect2> |     </sect2> | ||||||
|    </sect1> |    </sect1> | ||||||
|   | |||||||
| @@ -93,6 +93,7 @@ int			sync_method = DEFAULT_SYNC_METHOD; | |||||||
| int			wal_level = WAL_LEVEL_MINIMAL; | int			wal_level = WAL_LEVEL_MINIMAL; | ||||||
| int			CommitDelay = 0;	/* precommit delay in microseconds */ | int			CommitDelay = 0;	/* precommit delay in microseconds */ | ||||||
| int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */ | int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */ | ||||||
|  | int			wal_retrieve_retry_interval = 5000; | ||||||
|  |  | ||||||
| #ifdef WAL_DEBUG | #ifdef WAL_DEBUG | ||||||
| bool		XLOG_DEBUG = false; | bool		XLOG_DEBUG = false; | ||||||
| @@ -10340,8 +10341,8 @@ static bool | |||||||
| WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, | WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, | ||||||
| 							bool fetching_ckpt, XLogRecPtr tliRecPtr) | 							bool fetching_ckpt, XLogRecPtr tliRecPtr) | ||||||
| { | { | ||||||
| 	static pg_time_t last_fail_time = 0; | 	static TimestampTz	last_fail_time = 0; | ||||||
| 	pg_time_t	now; | 	TimestampTz	now; | ||||||
|  |  | ||||||
| 	/*------- | 	/*------- | ||||||
| 	 * Standby mode is implemented by a state machine: | 	 * Standby mode is implemented by a state machine: | ||||||
| @@ -10351,7 +10352,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, | |||||||
| 	 * 2. Check trigger file | 	 * 2. Check trigger file | ||||||
| 	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM) | 	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM) | ||||||
| 	 * 4. Rescan timelines | 	 * 4. Rescan timelines | ||||||
| 	 * 5. Sleep 5 seconds, and loop back to 1. | 	 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1. | ||||||
| 	 * | 	 * | ||||||
| 	 * Failure to read from the current source advances the state machine to | 	 * Failure to read from the current source advances the state machine to | ||||||
| 	 * the next state. | 	 * the next state. | ||||||
| @@ -10490,14 +10491,25 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, | |||||||
| 					 * machine, so we've exhausted all the options for | 					 * machine, so we've exhausted all the options for | ||||||
| 					 * obtaining the requested WAL. We're going to loop back | 					 * obtaining the requested WAL. We're going to loop back | ||||||
| 					 * and retry from the archive, but if it hasn't been long | 					 * and retry from the archive, but if it hasn't been long | ||||||
| 					 * since last attempt, sleep 5 seconds to avoid | 					 * since last attempt, sleep wal_retrieve_retry_interval | ||||||
| 					 * busy-waiting. | 					 * milliseconds to avoid busy-waiting. | ||||||
| 					 */ | 					 */ | ||||||
| 					now = (pg_time_t) time(NULL); | 					now = GetCurrentTimestamp(); | ||||||
| 					if ((now - last_fail_time) < 5) | 					if (!TimestampDifferenceExceeds(last_fail_time, now, | ||||||
|  | 													wal_retrieve_retry_interval)) | ||||||
| 					{ | 					{ | ||||||
| 						pg_usleep(1000000L * (5 - (now - last_fail_time))); | 						long		secs, wait_time; | ||||||
| 						now = (pg_time_t) time(NULL); | 						int			usecs; | ||||||
|  |  | ||||||
|  | 						TimestampDifference(last_fail_time, now, &secs, &usecs); | ||||||
|  | 						wait_time = wal_retrieve_retry_interval - | ||||||
|  | 							(secs * 1000 + usecs / 1000); | ||||||
|  |  | ||||||
|  | 						WaitLatch(&XLogCtl->recoveryWakeupLatch, | ||||||
|  | 								  WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, | ||||||
|  | 								  wait_time); | ||||||
|  | 						ResetLatch(&XLogCtl->recoveryWakeupLatch); | ||||||
|  | 						now = GetCurrentTimestamp(); | ||||||
| 					} | 					} | ||||||
| 					last_fail_time = now; | 					last_fail_time = now; | ||||||
| 					currentSource = XLOG_FROM_ARCHIVE; | 					currentSource = XLOG_FROM_ARCHIVE; | ||||||
| @@ -10653,12 +10665,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, | |||||||
| 					} | 					} | ||||||
|  |  | ||||||
| 					/* | 					/* | ||||||
| 					 * Wait for more WAL to arrive. Time out after 5 seconds, | 					 * Wait for more WAL to arrive. Time out after 5 seconds | ||||||
| 					 * like when polling the archive, to react to a trigger | 					 * to react to a trigger file promptly. | ||||||
| 					 * file promptly. |  | ||||||
| 					 */ | 					 */ | ||||||
| 					WaitLatch(&XLogCtl->recoveryWakeupLatch, | 					WaitLatch(&XLogCtl->recoveryWakeupLatch, | ||||||
| 							  WL_LATCH_SET | WL_TIMEOUT, | 							  WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, | ||||||
| 							  5000L); | 							  5000L); | ||||||
| 					ResetLatch(&XLogCtl->recoveryWakeupLatch); | 					ResetLatch(&XLogCtl->recoveryWakeupLatch); | ||||||
| 					break; | 					break; | ||||||
|   | |||||||
| @@ -2363,6 +2363,18 @@ static struct config_int ConfigureNamesInt[] = | |||||||
| 		NULL, NULL, NULL | 		NULL, NULL, NULL | ||||||
| 	}, | 	}, | ||||||
|  |  | ||||||
|  | 	{ | ||||||
|  | 		{"wal_retrieve_retry_interval", PGC_SIGHUP, REPLICATION_STANDBY, | ||||||
|  | 			gettext_noop("Sets the time to wait before retrying to retrieve WAL " | ||||||
|  | 						 "after a failed attempt."), | ||||||
|  | 			NULL, | ||||||
|  | 			GUC_UNIT_MS | ||||||
|  | 		}, | ||||||
|  | 		&wal_retrieve_retry_interval, | ||||||
|  | 		5000, 1, INT_MAX, | ||||||
|  | 		NULL, NULL, NULL | ||||||
|  | 	}, | ||||||
|  |  | ||||||
| 	{ | 	{ | ||||||
| 		{"wal_segment_size", PGC_INTERNAL, PRESET_OPTIONS, | 		{"wal_segment_size", PGC_INTERNAL, PRESET_OPTIONS, | ||||||
| 			gettext_noop("Shows the number of pages per write ahead log segment."), | 			gettext_noop("Shows the number of pages per write ahead log segment."), | ||||||
|   | |||||||
| @@ -260,6 +260,8 @@ | |||||||
| #wal_receiver_timeout = 60s		# time that receiver waits for | #wal_receiver_timeout = 60s		# time that receiver waits for | ||||||
| 					# communication from master | 					# communication from master | ||||||
| 					# in milliseconds; 0 disables | 					# in milliseconds; 0 disables | ||||||
|  | #wal_retrieve_retry_interval = 5s	# time to wait before retrying to | ||||||
|  | 					# retrieve WAL after a failed attempt | ||||||
|  |  | ||||||
|  |  | ||||||
| #------------------------------------------------------------------------------ | #------------------------------------------------------------------------------ | ||||||
|   | |||||||
| @@ -93,6 +93,7 @@ extern int	CheckPointSegments; | |||||||
| extern int	wal_keep_segments; | extern int	wal_keep_segments; | ||||||
| extern int	XLOGbuffers; | extern int	XLOGbuffers; | ||||||
| extern int	XLogArchiveTimeout; | extern int	XLogArchiveTimeout; | ||||||
|  | extern int	wal_retrieve_retry_interval; | ||||||
| extern bool XLogArchiveMode; | extern bool XLogArchiveMode; | ||||||
| extern char *XLogArchiveCommand; | extern char *XLogArchiveCommand; | ||||||
| extern bool EnableHotStandby; | extern bool EnableHotStandby; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user