1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-03 15:22:11 +03:00

Introduce replication slots.

Replication slots are a crash-safe data structure which can be created
on either a master or a standby to prevent premature removal of
write-ahead log segments needed by a standby, as well as (with
hot_standby_feedback=on) pruning of tuples whose removal would cause
replication conflicts.  Slots have some advantages over existing
techniques, as explained in the documentation.

In a few places, we refer to the type of replication slots introduced
by this patch as "physical" slots, because forthcoming patches for
logical decoding will also have slots, but with somewhat different
properties.

Andres Freund and Robert Haas
This commit is contained in:
Robert Haas
2014-01-31 22:45:17 -05:00
parent 5bdef38b89
commit 858ec11858
42 changed files with 2356 additions and 62 deletions

View File

@@ -53,6 +53,7 @@
#include "miscadmin.h"
#include "nodes/replnodes.h"
#include "replication/basebackup.h"
#include "replication/slot.h"
#include "replication/syncrep.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
@@ -218,12 +219,17 @@ InitWalSender(void)
void
WalSndErrorCleanup()
{
LWLockReleaseAll();
if (sendFile >= 0)
{
close(sendFile);
sendFile = -1;
}
if (MyReplicationSlot != NULL)
ReplicationSlotRelease();
replication_active = false;
if (walsender_ready_to_stop)
proc_exit(0);
@@ -421,6 +427,15 @@ StartReplication(StartReplicationCmd *cmd)
* written at wal_level='minimal'.
*/
if (cmd->slotname)
{
ReplicationSlotAcquire(cmd->slotname);
if (MyReplicationSlot->data.database != InvalidOid)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
(errmsg("cannot use a replication slot created for changeset extraction for streaming replication"))));
}
/*
* Select the timeline. If it was given explicitly by the client, use
* that. Otherwise use the timeline of the last replayed record, which is
@@ -565,6 +580,9 @@ StartReplication(StartReplicationCmd *cmd)
Assert(streamingDoneSending && streamingDoneReceiving);
}
if (cmd->slotname)
ReplicationSlotRelease();
/*
* Copy is finished now. Send a single-row result set indicating the next
* timeline.
@@ -622,6 +640,75 @@ StartReplication(StartReplicationCmd *cmd)
pq_puttextmessage('C', "START_STREAMING");
}
/*
* Create a new replication slot.
*/
static void
CreateReplicationSlot(CreateReplicationSlotCmd *cmd)
{
const char *slot_name;
StringInfoData buf;
Assert(!MyReplicationSlot);
/* setup state for XLogReadPage */
sendTimeLineIsHistoric = false;
sendTimeLine = ThisTimeLineID;
ReplicationSlotCreate(cmd->slotname, cmd->kind == REPLICATION_KIND_LOGICAL);
initStringInfo(&output_message);
slot_name = NameStr(MyReplicationSlot->data.name);
/*
* It may seem somewhat pointless to send back the same slot name the
* client just requested and nothing else, but logical replication
* will add more fields here. (We could consider removing the slot
* name from what's sent back, though, since the client has specified
* that.)
*/
pq_beginmessage(&buf, 'T');
pq_sendint(&buf, 1, 2); /* 1 field */
/* first field: slot name */
pq_sendstring(&buf, "slot_name"); /* col name */
pq_sendint(&buf, 0, 4); /* table oid */
pq_sendint(&buf, 0, 2); /* attnum */
pq_sendint(&buf, TEXTOID, 4); /* type oid */
pq_sendint(&buf, -1, 2); /* typlen */
pq_sendint(&buf, 0, 4); /* typmod */
pq_sendint(&buf, 0, 2); /* format code */
pq_endmessage(&buf);
/* Send a DataRow message */
pq_beginmessage(&buf, 'D');
pq_sendint(&buf, 1, 2); /* # of columns */
/* slot_name */
pq_sendint(&buf, strlen(slot_name), 4); /* col1 len */
pq_sendbytes(&buf, slot_name, strlen(slot_name));
pq_endmessage(&buf);
/*
* release active status again, START_REPLICATION will reacquire it
*/
ReplicationSlotRelease();
}
/*
* Get rid of a replication slot that is no longer wanted.
*/
static void
DropReplicationSlot(DropReplicationSlotCmd *cmd)
{
ReplicationSlotDrop(cmd->slotname);
EndCommand("DROP_REPLICATION_SLOT", DestRemote);
}
/*
* Execute an incoming replication command.
*/
@@ -660,14 +747,28 @@ exec_replication_command(const char *cmd_string)
IdentifySystem();
break;
case T_StartReplicationCmd:
StartReplication((StartReplicationCmd *) cmd_node);
break;
case T_BaseBackupCmd:
SendBaseBackup((BaseBackupCmd *) cmd_node);
break;
case T_CreateReplicationSlotCmd:
CreateReplicationSlot((CreateReplicationSlotCmd *) cmd_node);
break;
case T_DropReplicationSlotCmd:
DropReplicationSlot((DropReplicationSlotCmd *) cmd_node);
break;
case T_StartReplicationCmd:
{
StartReplicationCmd *cmd = (StartReplicationCmd *) cmd_node;
if (cmd->kind == REPLICATION_KIND_PHYSICAL)
StartReplication(cmd);
else
elog(ERROR, "cannot handle changeset extraction yet");
break;
}
case T_TimeLineHistoryCmd:
SendTimeLineHistory((TimeLineHistoryCmd *) cmd_node);
break;
@@ -830,6 +931,39 @@ ProcessStandbyMessage(void)
}
}
/*
* Remember that a walreceiver just confirmed receipt of lsn `lsn`.
*/
static void
PhysicalConfirmReceivedLocation(XLogRecPtr lsn)
{
bool changed = false;
/* use volatile pointer to prevent code rearrangement */
volatile ReplicationSlot *slot = MyReplicationSlot;
Assert(lsn != InvalidXLogRecPtr);
SpinLockAcquire(&slot->mutex);
if (slot->data.restart_lsn != lsn)
{
changed = true;
slot->data.restart_lsn = lsn;
}
SpinLockRelease(&slot->mutex);
if (changed)
{
ReplicationSlotMarkDirty();
ReplicationSlotsComputeRequiredLSN();
}
/*
* One could argue that the slot should saved to disk now, but that'd be
* energy wasted - the worst lost information can do here is give us wrong
* information in a statistics view - we'll just potentially be more
* conservative in removing files.
*/
}
/*
* Regular reply from standby advising of WAL positions on standby server.
*/
@@ -875,6 +1009,48 @@ ProcessStandbyReplyMessage(void)
if (!am_cascading_walsender)
SyncRepReleaseWaiters();
/*
* Advance our local xmin horizon when the client confirmed a flush.
*/
if (MyReplicationSlot && flushPtr != InvalidXLogRecPtr)
{
if (MyReplicationSlot->data.database != InvalidOid)
elog(ERROR, "cannot handle changeset extraction yet");
else
PhysicalConfirmReceivedLocation(flushPtr);
}
}
/* compute new replication slot xmin horizon if needed */
static void
PhysicalReplicationSlotNewXmin(TransactionId feedbackXmin)
{
bool changed = false;
volatile ReplicationSlot *slot = MyReplicationSlot;
SpinLockAcquire(&slot->mutex);
MyPgXact->xmin = InvalidTransactionId;
/*
* For physical replication we don't need the the interlock provided
* by xmin and effective_xmin since the consequences of a missed increase
* are limited to query cancellations, so set both at once.
*/
if (!TransactionIdIsNormal(slot->data.xmin) ||
!TransactionIdIsNormal(feedbackXmin) ||
TransactionIdPrecedes(slot->data.xmin, feedbackXmin))
{
changed = true;
slot->data.xmin = feedbackXmin;
slot->effective_xmin = feedbackXmin;
}
SpinLockRelease(&slot->mutex);
if (changed)
{
ReplicationSlotMarkDirty();
ReplicationSlotsComputeRequiredXmin();
}
}
/*
@@ -904,6 +1080,8 @@ ProcessStandbyHSFeedbackMessage(void)
if (!TransactionIdIsNormal(feedbackXmin))
{
MyPgXact->xmin = InvalidTransactionId;
if (MyReplicationSlot != NULL)
PhysicalReplicationSlotNewXmin(feedbackXmin);
return;
}
@@ -951,8 +1129,17 @@ ProcessStandbyHSFeedbackMessage(void)
* GetOldestXmin. (If we're moving our xmin forward, this is obviously
* safe, and if we're moving it backwards, well, the data is at risk
* already since a VACUUM could have just finished calling GetOldestXmin.)
*
* If we're using a replication slot we reserve the xmin via that,
* otherwise via the walsender's PGXACT entry.
* XXX: It might make sense to introduce ephemeral slots and always use
* the slot mechanism.
*/
MyPgXact->xmin = feedbackXmin;
if (MyReplicationSlot != NULL) /* XXX: persistency configurable? */
PhysicalReplicationSlotNewXmin(feedbackXmin);
else
MyPgXact->xmin = feedbackXmin;
}
/* Main loop of walsender process that streams the WAL over Copy messages. */