1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-09 06:21:09 +03:00

Efficient transaction-controlled synchronous replication.

If a standby is broadcasting reply messages and we have named
one or more standbys in synchronous_standby_names then allow
users who set synchronous_replication to wait for commit, which
then provides strict data integrity guarantees. The design avoids
sending and receiving transaction state information, and so minimises
bookkeeping overheads. We synchronize with the highest-priority
standby that is connected and ready to synchronize. Other standbys
can be defined to take over in case of standby failure.

This version has very strict behaviour; more relaxed options
may be added at a later date.

Simon Riggs and Fujii Masao, with reviews by Yeb Havinga, Jaime
Casanova, Heikki Linnakangas and Robert Haas, plus the assistance
of many other design reviewers.
This commit is contained in:
Simon Riggs
2011-03-06 22:49:16 +00:00
parent 149b2673c2
commit a8a8a3e096
21 changed files with 507 additions and 22 deletions

View File

@@ -13,7 +13,7 @@ top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
OBJS = walsender.o walreceiverfuncs.o walreceiver.o basebackup.o \
repl_gram.o
repl_gram.o syncrep.o
include $(top_srcdir)/src/backend/common.mk

View File

@@ -317,13 +317,9 @@ WalReceiverMain(void)
while (walrcv_receive(0, &type, &buf, &len))
XLogWalRcvProcessMsg(type, buf, len);
/* Let the master know that we received some data. */
XLogWalRcvSendReply();
XLogWalRcvSendHSFeedback();
/*
* If we've written some records, flush them to disk and let the
* startup process know about them.
* startup process and primary server know about them.
*/
XLogWalRcvFlush(false);
}
@@ -581,7 +577,10 @@ XLogWalRcvFlush(bool dying)
/* Also let the master know that we made some progress */
if (!dying)
{
XLogWalRcvSendReply();
XLogWalRcvSendHSFeedback();
}
}
}

View File

@@ -66,7 +66,7 @@
WalSndCtlData *WalSndCtl = NULL;
/* My slot in the shared memory array */
static WalSnd *MyWalSnd = NULL;
WalSnd *MyWalSnd = NULL;
/* Global state */
bool am_walsender = false; /* Am I a walsender process ? */
@@ -174,6 +174,8 @@ WalSenderMain(void)
SpinLockRelease(&walsnd->mutex);
}
SyncRepInitConfig();
/* Main loop of walsender */
return WalSndLoop();
}
@@ -584,6 +586,8 @@ ProcessStandbyReplyMessage(void)
walsnd->apply = reply.apply;
SpinLockRelease(&walsnd->mutex);
}
SyncRepReleaseWaiters();
}
/*
@@ -700,6 +704,7 @@ WalSndLoop(void)
{
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
SyncRepInitConfig();
}
/*
@@ -771,7 +776,12 @@ WalSndLoop(void)
* that point might wait for some time.
*/
if (MyWalSnd->state == WALSNDSTATE_CATCHUP && caughtup)
{
ereport(DEBUG1,
(errmsg("standby \"%s\" has now caught up with primary",
application_name)));
WalSndSetState(WALSNDSTATE_STREAMING);
}
ProcessRepliesIfAny();
}
@@ -1238,6 +1248,8 @@ WalSndShmemInit(void)
/* First time through, so initialize */
MemSet(WalSndCtl, 0, WalSndShmemSize());
SHMQueueInit(&(WalSndCtl->SyncRepQueue));
for (i = 0; i < max_wal_senders; i++)
{
WalSnd *walsnd = &WalSndCtl->walsnds[i];
@@ -1304,12 +1316,15 @@ WalSndGetStateString(WalSndState state)
Datum
pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
{
#define PG_STAT_GET_WAL_SENDERS_COLS 6
#define PG_STAT_GET_WAL_SENDERS_COLS 8
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
TupleDesc tupdesc;
Tuplestorestate *tupstore;
MemoryContext per_query_ctx;
MemoryContext oldcontext;
int sync_priority[max_wal_senders];
int priority = 0;
int sync_standby = -1;
int i;
/* check to see if caller supports us returning a tuplestore */
@@ -1337,6 +1352,33 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
MemoryContextSwitchTo(oldcontext);
/*
* Get the priorities of sync standbys all in one go, to minimise
* lock acquisitions and to allow us to evaluate who is the current
* sync standby. This code must match the code in SyncRepReleaseWaiters().
*/
LWLockAcquire(SyncRepLock, LW_SHARED);
for (i = 0; i < max_wal_senders; i++)
{
/* use volatile pointer to prevent code rearrangement */
volatile WalSnd *walsnd = &WalSndCtl->walsnds[i];
if (walsnd->pid != 0)
{
sync_priority[i] = walsnd->sync_standby_priority;
if (walsnd->state == WALSNDSTATE_STREAMING &&
walsnd->sync_standby_priority > 0 &&
(priority == 0 ||
priority > walsnd->sync_standby_priority))
{
priority = walsnd->sync_standby_priority;
sync_standby = i;
}
}
}
LWLockRelease(SyncRepLock);
for (i = 0; i < max_wal_senders; i++)
{
/* use volatile pointer to prevent code rearrangement */
@@ -1370,11 +1412,7 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
* Only superusers can see details. Other users only get
* the pid value to know it's a walsender, but no details.
*/
nulls[1] = true;
nulls[2] = true;
nulls[3] = true;
nulls[4] = true;
nulls[5] = true;
MemSet(&nulls[1], true, PG_STAT_GET_WAL_SENDERS_COLS - 1);
}
else
{
@@ -1401,6 +1439,19 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
snprintf(location, sizeof(location), "%X/%X",
apply.xlogid, apply.xrecoff);
values[5] = CStringGetTextDatum(location);
values[6] = Int32GetDatum(sync_priority[i]);
/*
* More easily understood version of standby state.
* This is purely informational, not different from priority.
*/
if (sync_priority[i] == 0)
values[7] = CStringGetTextDatum("ASYNC");
else if (i == sync_standby)
values[7] = CStringGetTextDatum("SYNC");
else
values[7] = CStringGetTextDatum("POTENTIAL");
}
tuplestore_putvalues(tupstore, tupdesc, values, nulls);