1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-31 17:02:12 +03:00

Support quorum-based synchronous replication.

This feature is also known as "quorum commit" especially in discussion
on pgsql-hackers.

This commit adds the following new syntaxes into synchronous_standby_names
GUC. By using FIRST and ANY keywords, users can specify the method to
choose synchronous standbys from the listed servers.

  FIRST num_sync (standby_name [, ...])
  ANY num_sync (standby_name [, ...])

The keyword FIRST specifies a priority-based synchronous replication
which was available also in 9.6 or before. This method makes transaction
commits wait until their WAL records are replicated to num_sync
synchronous standbys chosen based on their priorities.

The keyword ANY specifies a quorum-based synchronous replication
and makes transaction commits wait until their WAL records are
replicated to *at least* num_sync listed standbys. In this method,
the values of pg_stat_replication.sync_state for the listed standbys
are reported as "quorum". The priority is still assigned to each standby,
but not used in this method.

The existing syntaxes having neither FIRST nor ANY keyword are still
supported. They are the same as the new syntax with the FIRST keyword, i.e.,
a priority-based synchronous replication.

Author: Masahiko Sawada
Reviewed-By: Michael Paquier, Amit Kapila and me
Discussion: <CAD21AoAACi9NeC_ecm+Vahm+MMA6nYh=Kqs3KB3np+MBOS_gZg@mail.gmail.com>

Many thanks to the various individuals who were involved in
discussing and developing this feature.
This commit is contained in:
Fujii Masao
2016-12-19 21:15:30 +09:00
parent 10238fad03
commit 3901fd70cc
11 changed files with 397 additions and 87 deletions

View File

@@ -30,23 +30,34 @@
* searching through all the waiters each time we receive a reply.
*
* In 9.5 or before only a single standby could be considered as
* synchronous. In 9.6 we support multiple synchronous standbys.
* The number of synchronous standbys that transactions must wait for
* replies from is specified in synchronous_standby_names.
* This parameter also specifies a list of standby names,
* which determines the priority of each standby for being chosen as
* a synchronous standby. The standbys whose names appear earlier
* in the list are given higher priority and will be considered as
* synchronous. Other standby servers appearing later in this list
* represent potential synchronous standbys. If any of the current
* synchronous standbys disconnects for whatever reason, it will be
* replaced immediately with the next-highest-priority standby.
* synchronous. In 9.6 we support priority-based multiple synchronous
* standbys. In 10.0 quorum-based multiple synchronous standbys are also
* supported. The number of synchronous standbys that transactions
* must wait for replies from is specified in synchronous_standby_names.
* This parameter also specifies a list of standby names and the method
* (FIRST and ANY) to choose synchronous standbys from the listed ones.
*
* The method FIRST specifies a priority-based synchronous replication
* and makes transaction commits wait until their WAL records are
* replicated to the requested number of synchronous standbys chosen based
* on their priorities. The standbys whose names appear earlier in the list
* are given higher priority and will be considered as synchronous.
* Other standby servers appearing later in this list represent potential
* synchronous standbys. If any of the current synchronous standbys
* disconnects for whatever reason, it will be replaced immediately with
* the next-highest-priority standby.
*
* The method ANY specifies a quorum-based synchronous replication
* and makes transaction commits wait until their WAL records are
* replicated to at least the requested number of synchronous standbys
* in the list. All the standbys appearing in the list are considered as
* candidates for quorum synchronous standbys.
*
* Before the standbys chosen from synchronous_standby_names can
* become the synchronous standbys they must have caught up with
* the primary; that may take some time. Once caught up,
* the current higher priority standbys which are considered as
* synchronous at that moment will release waiters from the queue.
* the standbys which are considered as synchronous at that moment
* will release waiters from the queue.
*
* Portions Copyright (c) 2010-2016, PostgreSQL Global Development Group
*
@@ -79,18 +90,29 @@ char *SyncRepStandbyNames;
static bool announce_next_takeover = true;
static SyncRepConfigData *SyncRepConfig = NULL;
SyncRepConfigData *SyncRepConfig = NULL;
static int SyncRepWaitMode = SYNC_REP_NO_WAIT;
static void SyncRepQueueInsert(int mode);
static void SyncRepCancelWait(void);
static int SyncRepWakeQueue(bool all, int mode);
static bool SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr,
XLogRecPtr *flushPtr,
XLogRecPtr *applyPtr,
bool *am_sync);
static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr,
XLogRecPtr *flushPtr,
XLogRecPtr *applyPtr,
bool *am_sync);
static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr,
XLogRecPtr *flushPtr,
XLogRecPtr *applyPtr,
List *sync_standbys);
static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr,
XLogRecPtr *flushPtr,
XLogRecPtr *applyPtr,
List *sync_standbys, uint8 nth);
static int SyncRepGetStandbyPriority(void);
static List *SyncRepGetSyncStandbysPriority(bool *am_sync);
static List *SyncRepGetSyncStandbysQuorum(bool *am_sync);
static int cmp_lsn(const void *a, const void *b);
#ifdef USE_ASSERT_CHECKING
static bool SyncRepQueueIsOrderedByLSN(int mode);
@@ -386,7 +408,7 @@ SyncRepReleaseWaiters(void)
XLogRecPtr writePtr;
XLogRecPtr flushPtr;
XLogRecPtr applyPtr;
bool got_oldest;
bool got_recptr;
bool am_sync;
int numwrite = 0;
int numflush = 0;
@@ -413,11 +435,10 @@ SyncRepReleaseWaiters(void)
LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
/*
* Check whether we are a sync standby or not, and calculate the oldest
* Check whether we are a sync standby or not, and calculate the synced
* positions among all sync standbys.
*/
got_oldest = SyncRepGetOldestSyncRecPtr(&writePtr, &flushPtr,
&applyPtr, &am_sync);
got_recptr = SyncRepGetSyncRecPtr(&writePtr, &flushPtr, &applyPtr, &am_sync);
/*
* If we are managing a sync standby, though we weren't prior to this,
@@ -426,16 +447,22 @@ SyncRepReleaseWaiters(void)
if (announce_next_takeover && am_sync)
{
announce_next_takeover = false;
ereport(LOG,
(errmsg("standby \"%s\" is now a synchronous standby with priority %u",
application_name, MyWalSnd->sync_standby_priority)));
if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
ereport(LOG,
(errmsg("standby \"%s\" is now a synchronous standby with priority %u",
application_name, MyWalSnd->sync_standby_priority)));
else
ereport(LOG,
(errmsg("standby \"%s\" is now a candidate for quorum synchronous standby",
application_name)));
}
/*
* If the number of sync standbys is less than requested or we aren't
* managing a sync standby then just leave.
*/
if (!got_oldest || !am_sync)
if (!got_recptr || !am_sync)
{
LWLockRelease(SyncRepLock);
announce_next_takeover = !am_sync;
@@ -471,21 +498,20 @@ SyncRepReleaseWaiters(void)
}
/*
* Calculate the oldest Write, Flush and Apply positions among sync standbys.
* Calculate the synced Write, Flush and Apply positions among sync standbys.
*
* Return false if the number of sync standbys is less than
* synchronous_standby_names specifies. Otherwise return true and
* store the oldest positions into *writePtr, *flushPtr and *applyPtr.
* store the positions into *writePtr, *flushPtr and *applyPtr.
*
* On return, *am_sync is set to true if this walsender is connected to
* a sync standby. Otherwise it's set to false.
*/
static bool
SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr,
SyncRepGetSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr,
XLogRecPtr *applyPtr, bool *am_sync)
{
List *sync_standbys;
ListCell *cell;
*writePtr = InvalidXLogRecPtr;
*flushPtr = InvalidXLogRecPtr;
@@ -508,12 +534,49 @@ SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr,
}
/*
* Scan through all sync standbys and calculate the oldest Write, Flush
* and Apply positions.
* In a priority-based sync replication, the synced positions are the
* oldest ones among sync standbys. In a quorum-based, they are the Nth
* latest ones.
*
* SyncRepGetNthLatestSyncRecPtr() also can calculate the oldest positions.
* But we use SyncRepGetOldestSyncRecPtr() for that calculation because
* it's a bit more efficient.
*
* XXX If the numbers of current and requested sync standbys are the same,
* we can use SyncRepGetOldestSyncRecPtr() to calculate the synced
* positions even in a quorum-based sync replication.
*/
foreach(cell, sync_standbys)
if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
{
WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];
SyncRepGetOldestSyncRecPtr(writePtr, flushPtr, applyPtr,
sync_standbys);
}
else
{
SyncRepGetNthLatestSyncRecPtr(writePtr, flushPtr, applyPtr,
sync_standbys, SyncRepConfig->num_sync);
}
list_free(sync_standbys);
return true;
}
/*
* Calculate the oldest Write, Flush and Apply positions among sync standbys.
*/
static void
SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr,
XLogRecPtr *applyPtr, List *sync_standbys)
{
ListCell *cell;
/*
* Scan through all sync standbys and calculate the oldest
* Write, Flush and Apply positions.
*/
foreach (cell, sync_standbys)
{
WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];
XLogRecPtr write;
XLogRecPtr flush;
XLogRecPtr apply;
@@ -531,23 +594,163 @@ SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr,
if (XLogRecPtrIsInvalid(*applyPtr) || *applyPtr > apply)
*applyPtr = apply;
}
}
list_free(sync_standbys);
return true;
/*
 * Calculate the Nth latest Write, Flush and Apply positions among sync
 * standbys.
 *
 * sync_standbys is a list of indexes into WalSndCtl->walsnds.  The write,
 * flush and apply positions of every listed walsender are collected, each
 * set is sorted independently in descending order, and the nth entry
 * (1-based) of each set is stored into *writePtr, *flushPtr and *applyPtr.
 * Because the three sets are ranked separately, the three results may
 * originate from different standbys.
 *
 * The caller is expected to pass 1 <= nth <= list_length(sync_standbys).
 * If that does not hold (for instance because a larger count was truncated
 * to zero by the uint8 parameter), the output arguments are left untouched
 * rather than being filled from memory outside the arrays.
 */
static void
SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr,
							  XLogRecPtr *applyPtr, List *sync_standbys, uint8 nth)
{
	ListCell   *cell;
	XLogRecPtr *write_array;
	XLogRecPtr *flush_array;
	XLogRecPtr *apply_array;
	int			len;
	int			i = 0;

	len = list_length(sync_standbys);

	/*
	 * Refuse to index outside the arrays: nth - 1 must be a valid subscript.
	 * This should not happen when the caller has verified that enough
	 * standbys are connected.
	 */
	if (nth == 0 || nth > len)
		return;

	write_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
	flush_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);
	apply_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len);

	foreach(cell, sync_standbys)
	{
		WalSnd	   *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];

		/* Read all three positions of one standby under its spinlock */
		SpinLockAcquire(&walsnd->mutex);
		write_array[i] = walsnd->write;
		flush_array[i] = walsnd->flush;
		apply_array[i] = walsnd->apply;
		SpinLockRelease(&walsnd->mutex);

		i++;
	}

	/* cmp_lsn sorts in descending order, so index 0 holds the latest LSN */
	qsort(write_array, len, sizeof(XLogRecPtr), cmp_lsn);
	qsort(flush_array, len, sizeof(XLogRecPtr), cmp_lsn);
	qsort(apply_array, len, sizeof(XLogRecPtr), cmp_lsn);

	/* Get Nth latest Write, Flush, Apply positions */
	*writePtr = write_array[nth - 1];
	*flushPtr = flush_array[nth - 1];
	*applyPtr = apply_array[nth - 1];

	pfree(write_array);
	pfree(flush_array);
	pfree(apply_array);
}
/*
 * qsort() comparator for XLogRecPtr values producing a descending order:
 * the larger LSN sorts first, equal LSNs compare as equal.
 */
static int
cmp_lsn(const void *a, const void *b)
{
	const XLogRecPtr *lhs = (const XLogRecPtr *) a;
	const XLogRecPtr *rhs = (const XLogRecPtr *) b;

	if (*lhs == *rhs)
		return 0;

	/* Negative result places the larger LSN ahead of the smaller one */
	return (*lhs > *rhs) ? -1 : 1;
}
/*
* Return the list of sync standbys, or NIL if no sync standby is connected.
*
* If there are multiple standbys with the same priority,
* the first one found is selected preferentially.
* The caller must hold SyncRepLock.
*
* On return, *am_sync is set to true if this walsender is connected to
* a sync standby. Otherwise it's set to false.
*/
List *
SyncRepGetSyncStandbys(bool *am_sync)
{
	/*
	 * Set default result, so that *am_sync is defined on every return path
	 * (am_sync may be NULL when the caller is not interested).
	 */
	if (am_sync != NULL)
		*am_sync = false;

	/* Quick exit if sync replication is not requested */
	if (SyncRepConfig == NULL)
		return NIL;

	/* Delegate to the selection routine for the configured method */
	if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY)
		return SyncRepGetSyncStandbysPriority(am_sync);

	return SyncRepGetSyncStandbysQuorum(am_sync);
}
/*
 * Collect every walsender that qualifies as a candidate for quorum sync
 * standby and return their indexes in WalSndCtl->walsnds as an integer
 * list, or NIL if no such standby is connected.
 *
 * The caller must hold SyncRepLock. This function must be called only in
 * a quorum-based sync replication.
 *
 * If am_sync is not NULL, *am_sync is set to true when this walsender is
 * one of the candidates. It is never set to false here; the caller is
 * responsible for initializing it.
 */
static List *
SyncRepGetSyncStandbysQuorum(bool *am_sync)
{
	List	   *candidates = NIL;
	int			slot;

	Assert(SyncRepConfig->syncrep_method == SYNC_REP_QUORUM);

	for (slot = 0; slot < max_wal_senders; slot++)
	{
		/* Use volatile pointer to prevent code rearrangement */
		volatile WalSnd *walsnd = &WalSndCtl->walsnds[slot];

		/*
		 * A candidate must be active, streaming, listed as synchronous
		 * (non-zero priority) and must have a valid flush position.
		 */
		if (walsnd->pid == 0 ||
			walsnd->state != WALSNDSTATE_STREAMING ||
			walsnd->sync_standby_priority == 0 ||
			XLogRecPtrIsInvalid(walsnd->flush))
			continue;

		/* Qualified: remember its slot index */
		candidates = lappend_int(candidates, slot);

		if (am_sync != NULL && walsnd == MyWalSnd)
			*am_sync = true;
	}

	return candidates;
}
/*
* Return the list of sync standbys chosen based on their priorities,
* or NIL if no sync standby is connected.
*
* If there are multiple standbys with the same priority,
* the first one found is selected preferentially.
*
* The caller must hold SyncRepLock. This function must be called only in
* a priority-based sync replication.
*
* On return, *am_sync is set to true if this walsender is connected to
* a sync standby. Otherwise it's set to false.
*/
static List *
SyncRepGetSyncStandbysPriority(bool *am_sync)
{
List *result = NIL;
List *pending = NIL;
@@ -560,13 +763,7 @@ SyncRepGetSyncStandbys(bool *am_sync)
volatile WalSnd *walsnd; /* Use volatile pointer to prevent code
* rearrangement */
/* Set default result */
if (am_sync != NULL)
*am_sync = false;
/* Quick exit if sync replication is not requested */
if (SyncRepConfig == NULL)
return NIL;
Assert(SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY);
lowest_priority = SyncRepConfig->nmembers;
next_highest_priority = lowest_priority + 1;