1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-26 12:21:12 +03:00

Allow publishing partition changes via ancestors

To control whether partition changes are replicated using their own
identity and schema or an ancestor's, add a new parameter that can be
set per publication named 'publish_via_partition_root'.

This allows replicating a partitioned table into a different partition
structure on the subscriber.

Author: Amit Langote <amitlangote09@gmail.com>
Reviewed-by: Rafia Sabih <rafia.pghackers@gmail.com>
Reviewed-by: Peter Eisentraut <peter.eisentraut@2ndquadrant.com>
Reviewed-by: Petr Jelinek <petr@2ndquadrant.com>
Discussion: https://www.postgresql.org/message-id/flat/CA+HiwqH=Y85vRK3mOdjEkqFK+E=ST=eQiHdpj43L=_eJMOOznQ@mail.gmail.com
This commit is contained in:
Peter Eisentraut
2020-04-08 09:59:27 +02:00
parent 1aac32df89
commit 83fd4532a7
15 changed files with 724 additions and 174 deletions

View File

@ -12,6 +12,8 @@
*/
#include "postgres.h"
#include "access/tupconvert.h"
#include "catalog/partition.h"
#include "catalog/pg_publication.h"
#include "fmgr.h"
#include "replication/logical.h"
@ -20,6 +22,7 @@
#include "replication/pgoutput.h"
#include "utils/int8.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/syscache.h"
#include "utils/varlena.h"
@ -49,6 +52,7 @@ static bool publications_valid;
static List *LoadPublications(List *pubnames);
static void publication_invalidation_cb(Datum arg, int cacheid,
uint32 hashvalue);
static void send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx);
/*
* Entry in the map used to remember which relation schemas we sent.
@ -59,9 +63,31 @@ static void publication_invalidation_cb(Datum arg, int cacheid,
typedef struct RelationSyncEntry
{
Oid relid; /* relation oid */
bool schema_sent; /* did we send the schema? */
/*
* Did we send the schema? If ancestor relid is set, its schema must also
* have been sent for this to be true.
*/
bool schema_sent;
bool replicate_valid;
PublicationActions pubactions;
/*
* OID of the relation to publish changes as. For a partition, this may
* be set to one of its ancestors whose schema will be used when
* replicating changes, if publish_via_partition_root is set for the
* publication.
*/
Oid publish_as_relid;
/*
* Map used when replicating using an ancestor's schema to convert tuples
* from partition's type to the ancestor's; NULL if publish_as_relid is
* same as 'relid' or if unnecessary due to partition and the ancestor
* having identical TupleDesc.
*/
TupleConversionMap *map;
} RelationSyncEntry;
/* Map used to remember which relation schemas we sent. */
@ -259,47 +285,71 @@ pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
}
/*
* Write the relation schema if the current schema hasn't been sent yet.
* Write the current schema of the relation and its ancestor (if any) if not
* done yet.
*/
static void
maybe_send_schema(LogicalDecodingContext *ctx,
Relation relation, RelationSyncEntry *relentry)
{
if (!relentry->schema_sent)
if (relentry->schema_sent)
return;
/* If needed, send the ancestor's schema first. */
if (relentry->publish_as_relid != RelationGetRelid(relation))
{
TupleDesc desc;
int i;
Relation ancestor = RelationIdGetRelation(relentry->publish_as_relid);
TupleDesc indesc = RelationGetDescr(relation);
TupleDesc outdesc = RelationGetDescr(ancestor);
MemoryContext oldctx;
desc = RelationGetDescr(relation);
/* Map must live as long as the session does. */
oldctx = MemoryContextSwitchTo(CacheMemoryContext);
relentry->map = convert_tuples_by_name(indesc, outdesc);
MemoryContextSwitchTo(oldctx);
send_relation_and_attrs(ancestor, ctx);
RelationClose(ancestor);
}
/*
* Write out type info if needed. We do that only for user-created
* types. We use FirstGenbkiObjectId as the cutoff, so that we only
* consider objects with hand-assigned OIDs to be "built in", not for
* instance any function or type defined in the information_schema.
* This is important because only hand-assigned OIDs can be expected
* to remain stable across major versions.
*/
for (i = 0; i < desc->natts; i++)
{
Form_pg_attribute att = TupleDescAttr(desc, i);
send_relation_and_attrs(relation, ctx);
relentry->schema_sent = true;
}
if (att->attisdropped || att->attgenerated)
continue;
/*
* Sends a relation
*/
static void
send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx)
{
TupleDesc desc = RelationGetDescr(relation);
int i;
if (att->atttypid < FirstGenbkiObjectId)
continue;
/*
* Write out type info if needed. We do that only for user-created types.
* We use FirstGenbkiObjectId as the cutoff, so that we only consider
* objects with hand-assigned OIDs to be "built in", not for instance any
* function or type defined in the information_schema. This is important
* because only hand-assigned OIDs can be expected to remain stable across
* major versions.
*/
for (i = 0; i < desc->natts; i++)
{
Form_pg_attribute att = TupleDescAttr(desc, i);
OutputPluginPrepareWrite(ctx, false);
logicalrep_write_typ(ctx->out, att->atttypid);
OutputPluginWrite(ctx, false);
}
if (att->attisdropped || att->attgenerated)
continue;
if (att->atttypid < FirstGenbkiObjectId)
continue;
OutputPluginPrepareWrite(ctx, false);
logicalrep_write_rel(ctx->out, relation);
logicalrep_write_typ(ctx->out, att->atttypid);
OutputPluginWrite(ctx, false);
relentry->schema_sent = true;
}
OutputPluginPrepareWrite(ctx, false);
logicalrep_write_rel(ctx->out, relation);
OutputPluginWrite(ctx, false);
}
/*
@ -346,28 +396,65 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
switch (change->action)
{
case REORDER_BUFFER_CHANGE_INSERT:
OutputPluginPrepareWrite(ctx, true);
logicalrep_write_insert(ctx->out, relation,
&change->data.tp.newtuple->tuple);
OutputPluginWrite(ctx, true);
break;
{
HeapTuple tuple = &change->data.tp.newtuple->tuple;
/* Switch relation if publishing via root. */
if (relentry->publish_as_relid != RelationGetRelid(relation))
{
Assert(relation->rd_rel->relispartition);
relation = RelationIdGetRelation(relentry->publish_as_relid);
/* Convert tuple if needed. */
if (relentry->map)
tuple = execute_attr_map_tuple(tuple, relentry->map);
}
OutputPluginPrepareWrite(ctx, true);
logicalrep_write_insert(ctx->out, relation, tuple);
OutputPluginWrite(ctx, true);
break;
}
case REORDER_BUFFER_CHANGE_UPDATE:
{
HeapTuple oldtuple = change->data.tp.oldtuple ?
&change->data.tp.oldtuple->tuple : NULL;
HeapTuple newtuple = &change->data.tp.newtuple->tuple;
/* Switch relation if publishing via root. */
if (relentry->publish_as_relid != RelationGetRelid(relation))
{
Assert(relation->rd_rel->relispartition);
relation = RelationIdGetRelation(relentry->publish_as_relid);
/* Convert tuples if needed. */
if (relentry->map)
{
oldtuple = execute_attr_map_tuple(oldtuple, relentry->map);
newtuple = execute_attr_map_tuple(newtuple, relentry->map);
}
}
OutputPluginPrepareWrite(ctx, true);
logicalrep_write_update(ctx->out, relation, oldtuple,
&change->data.tp.newtuple->tuple);
logicalrep_write_update(ctx->out, relation, oldtuple, newtuple);
OutputPluginWrite(ctx, true);
break;
}
case REORDER_BUFFER_CHANGE_DELETE:
if (change->data.tp.oldtuple)
{
HeapTuple oldtuple = &change->data.tp.oldtuple->tuple;
/* Switch relation if publishing via root. */
if (relentry->publish_as_relid != RelationGetRelid(relation))
{
Assert(relation->rd_rel->relispartition);
relation = RelationIdGetRelation(relentry->publish_as_relid);
/* Convert tuple if needed. */
if (relentry->map)
oldtuple = execute_attr_map_tuple(oldtuple, relentry->map);
}
OutputPluginPrepareWrite(ctx, true);
logicalrep_write_delete(ctx->out, relation,
&change->data.tp.oldtuple->tuple);
logicalrep_write_delete(ctx->out, relation, oldtuple);
OutputPluginWrite(ctx, true);
}
else
@ -412,10 +499,11 @@ pgoutput_truncate(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
continue;
/*
* Don't send partitioned tables, because partitions should be sent
* instead.
* Don't send partitions if the publication wants to send only the
* root tables through it.
*/
if (relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
if (relation->rd_rel->relispartition &&
relentry->publish_as_relid != relid)
continue;
relids[nrelids++] = relid;
@ -540,12 +628,15 @@ init_rel_sync_cache(MemoryContext cachectx)
* This looks up publications that the given relation is directly or
* indirectly part of (the latter if it's really the relation's ancestor that
* is part of a publication) and fills up the found entry with the information
* about which operations to publish.
* about which operations to publish and whether to use an ancestor's schema
* when publishing.
*/
static RelationSyncEntry *
get_rel_sync_entry(PGOutputData *data, Oid relid)
{
RelationSyncEntry *entry;
bool am_partition = get_rel_relispartition(relid);
char relkind = get_rel_relkind(relid);
bool found;
MemoryContext oldctx;
@ -564,6 +655,7 @@ get_rel_sync_entry(PGOutputData *data, Oid relid)
{
List *pubids = GetRelationPublications(relid);
ListCell *lc;
Oid publish_as_relid = relid;
/* Reload publications if needed before use. */
if (!publications_valid)
@ -588,8 +680,56 @@ get_rel_sync_entry(PGOutputData *data, Oid relid)
foreach(lc, data->publications)
{
Publication *pub = lfirst(lc);
bool publish = false;
if (pub->alltables || list_member_oid(pubids, pub->oid))
if (pub->alltables)
{
publish = true;
if (pub->pubviaroot && am_partition)
publish_as_relid = llast_oid(get_partition_ancestors(relid));
}
if (!publish)
{
bool ancestor_published = false;
/*
* For a partition, check if any of the ancestors are
* published. If so, note down the topmost ancestor that is
* published via this publication, which will be used as the
* relation via which to publish the partition's changes.
*/
if (am_partition)
{
List *ancestors = get_partition_ancestors(relid);
ListCell *lc2;
/* Find the "topmost" ancestor that is in this publication. */
foreach(lc2, ancestors)
{
Oid ancestor = lfirst_oid(lc2);
if (list_member_oid(GetRelationPublications(ancestor),
pub->oid))
{
ancestor_published = true;
if (pub->pubviaroot)
publish_as_relid = ancestor;
}
}
}
if (list_member_oid(pubids, pub->oid) || ancestor_published)
publish = true;
}
/*
* Don't publish changes for partitioned tables, because
* publishing those of its partitions suffices, unless partition
* changes won't be published due to pubviaroot being set.
*/
if (publish &&
(relkind != RELKIND_PARTITIONED_TABLE || pub->pubviaroot))
{
entry->pubactions.pubinsert |= pub->pubactions.pubinsert;
entry->pubactions.pubupdate |= pub->pubactions.pubupdate;
@ -604,6 +744,7 @@ get_rel_sync_entry(PGOutputData *data, Oid relid)
list_free(pubids);
entry->publish_as_relid = publish_as_relid;
entry->replicate_valid = true;
}