mirror of
https://github.com/postgres/postgres.git
synced 2025-06-26 12:21:12 +03:00
Allow publishing partition changes via ancestors
To control whether partition changes are replicated using their own identity and schema or an ancestor's, add a new parameter that can be set per publication named 'publish_via_partition_root'. This allows replicating a partitioned table into a different partition structure on the subscriber. Author: Amit Langote <amitlangote09@gmail.com> Reviewed-by: Rafia Sabih <rafia.pghackers@gmail.com> Reviewed-by: Peter Eisentraut <peter.eisentraut@2ndquadrant.com> Reviewed-by: Petr Jelinek <petr@2ndquadrant.com> Discussion: https://www.postgresql.org/message-id/flat/CA+HiwqH=Y85vRK3mOdjEkqFK+E=ST=eQiHdpj43L=_eJMOOznQ@mail.gmail.com
This commit is contained in:
@ -12,6 +12,8 @@
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/tupconvert.h"
|
||||
#include "catalog/partition.h"
|
||||
#include "catalog/pg_publication.h"
|
||||
#include "fmgr.h"
|
||||
#include "replication/logical.h"
|
||||
@ -20,6 +22,7 @@
|
||||
#include "replication/pgoutput.h"
|
||||
#include "utils/int8.h"
|
||||
#include "utils/inval.h"
|
||||
#include "utils/lsyscache.h"
|
||||
#include "utils/memutils.h"
|
||||
#include "utils/syscache.h"
|
||||
#include "utils/varlena.h"
|
||||
@ -49,6 +52,7 @@ static bool publications_valid;
|
||||
static List *LoadPublications(List *pubnames);
|
||||
static void publication_invalidation_cb(Datum arg, int cacheid,
|
||||
uint32 hashvalue);
|
||||
static void send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx);
|
||||
|
||||
/*
|
||||
* Entry in the map used to remember which relation schemas we sent.
|
||||
@ -59,9 +63,31 @@ static void publication_invalidation_cb(Datum arg, int cacheid,
|
||||
typedef struct RelationSyncEntry
|
||||
{
|
||||
Oid relid; /* relation oid */
|
||||
bool schema_sent; /* did we send the schema? */
|
||||
|
||||
/*
|
||||
* Did we send the schema? If ancestor relid is set, its schema must also
|
||||
* have been sent for this to be true.
|
||||
*/
|
||||
bool schema_sent;
|
||||
|
||||
bool replicate_valid;
|
||||
PublicationActions pubactions;
|
||||
|
||||
/*
|
||||
* OID of the relation to publish changes as. For a partition, this may
|
||||
* be set to one of its ancestors whose schema will be used when
|
||||
* replicating changes, if publish_via_partition_root is set for the
|
||||
* publication.
|
||||
*/
|
||||
Oid publish_as_relid;
|
||||
|
||||
/*
|
||||
* Map used when replicating using an ancestor's schema to convert tuples
|
||||
* from partition's type to the ancestor's; NULL if publish_as_relid is
|
||||
* same as 'relid' or if unnecessary due to partition and the ancestor
|
||||
* having identical TupleDesc.
|
||||
*/
|
||||
TupleConversionMap *map;
|
||||
} RelationSyncEntry;
|
||||
|
||||
/* Map used to remember which relation schemas we sent. */
|
||||
@ -259,47 +285,71 @@ pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
|
||||
}
|
||||
|
||||
/*
|
||||
* Write the relation schema if the current schema hasn't been sent yet.
|
||||
* Write the current schema of the relation and its ancestor (if any) if not
|
||||
* done yet.
|
||||
*/
|
||||
static void
|
||||
maybe_send_schema(LogicalDecodingContext *ctx,
|
||||
Relation relation, RelationSyncEntry *relentry)
|
||||
{
|
||||
if (!relentry->schema_sent)
|
||||
if (relentry->schema_sent)
|
||||
return;
|
||||
|
||||
/* If needed, send the ancestor's schema first. */
|
||||
if (relentry->publish_as_relid != RelationGetRelid(relation))
|
||||
{
|
||||
TupleDesc desc;
|
||||
int i;
|
||||
Relation ancestor = RelationIdGetRelation(relentry->publish_as_relid);
|
||||
TupleDesc indesc = RelationGetDescr(relation);
|
||||
TupleDesc outdesc = RelationGetDescr(ancestor);
|
||||
MemoryContext oldctx;
|
||||
|
||||
desc = RelationGetDescr(relation);
|
||||
/* Map must live as long as the session does. */
|
||||
oldctx = MemoryContextSwitchTo(CacheMemoryContext);
|
||||
relentry->map = convert_tuples_by_name(indesc, outdesc);
|
||||
MemoryContextSwitchTo(oldctx);
|
||||
send_relation_and_attrs(ancestor, ctx);
|
||||
RelationClose(ancestor);
|
||||
}
|
||||
|
||||
/*
|
||||
* Write out type info if needed. We do that only for user-created
|
||||
* types. We use FirstGenbkiObjectId as the cutoff, so that we only
|
||||
* consider objects with hand-assigned OIDs to be "built in", not for
|
||||
* instance any function or type defined in the information_schema.
|
||||
* This is important because only hand-assigned OIDs can be expected
|
||||
* to remain stable across major versions.
|
||||
*/
|
||||
for (i = 0; i < desc->natts; i++)
|
||||
{
|
||||
Form_pg_attribute att = TupleDescAttr(desc, i);
|
||||
send_relation_and_attrs(relation, ctx);
|
||||
relentry->schema_sent = true;
|
||||
}
|
||||
|
||||
if (att->attisdropped || att->attgenerated)
|
||||
continue;
|
||||
/*
|
||||
* Sends a relation
|
||||
*/
|
||||
static void
|
||||
send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx)
|
||||
{
|
||||
TupleDesc desc = RelationGetDescr(relation);
|
||||
int i;
|
||||
|
||||
if (att->atttypid < FirstGenbkiObjectId)
|
||||
continue;
|
||||
/*
|
||||
* Write out type info if needed. We do that only for user-created types.
|
||||
* We use FirstGenbkiObjectId as the cutoff, so that we only consider
|
||||
* objects with hand-assigned OIDs to be "built in", not for instance any
|
||||
* function or type defined in the information_schema. This is important
|
||||
* because only hand-assigned OIDs can be expected to remain stable across
|
||||
* major versions.
|
||||
*/
|
||||
for (i = 0; i < desc->natts; i++)
|
||||
{
|
||||
Form_pg_attribute att = TupleDescAttr(desc, i);
|
||||
|
||||
OutputPluginPrepareWrite(ctx, false);
|
||||
logicalrep_write_typ(ctx->out, att->atttypid);
|
||||
OutputPluginWrite(ctx, false);
|
||||
}
|
||||
if (att->attisdropped || att->attgenerated)
|
||||
continue;
|
||||
|
||||
if (att->atttypid < FirstGenbkiObjectId)
|
||||
continue;
|
||||
|
||||
OutputPluginPrepareWrite(ctx, false);
|
||||
logicalrep_write_rel(ctx->out, relation);
|
||||
logicalrep_write_typ(ctx->out, att->atttypid);
|
||||
OutputPluginWrite(ctx, false);
|
||||
relentry->schema_sent = true;
|
||||
}
|
||||
|
||||
OutputPluginPrepareWrite(ctx, false);
|
||||
logicalrep_write_rel(ctx->out, relation);
|
||||
OutputPluginWrite(ctx, false);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -346,28 +396,65 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
|
||||
switch (change->action)
|
||||
{
|
||||
case REORDER_BUFFER_CHANGE_INSERT:
|
||||
OutputPluginPrepareWrite(ctx, true);
|
||||
logicalrep_write_insert(ctx->out, relation,
|
||||
&change->data.tp.newtuple->tuple);
|
||||
OutputPluginWrite(ctx, true);
|
||||
break;
|
||||
{
|
||||
HeapTuple tuple = &change->data.tp.newtuple->tuple;
|
||||
|
||||
/* Switch relation if publishing via root. */
|
||||
if (relentry->publish_as_relid != RelationGetRelid(relation))
|
||||
{
|
||||
Assert(relation->rd_rel->relispartition);
|
||||
relation = RelationIdGetRelation(relentry->publish_as_relid);
|
||||
/* Convert tuple if needed. */
|
||||
if (relentry->map)
|
||||
tuple = execute_attr_map_tuple(tuple, relentry->map);
|
||||
}
|
||||
|
||||
OutputPluginPrepareWrite(ctx, true);
|
||||
logicalrep_write_insert(ctx->out, relation, tuple);
|
||||
OutputPluginWrite(ctx, true);
|
||||
break;
|
||||
}
|
||||
case REORDER_BUFFER_CHANGE_UPDATE:
|
||||
{
|
||||
HeapTuple oldtuple = change->data.tp.oldtuple ?
|
||||
&change->data.tp.oldtuple->tuple : NULL;
|
||||
HeapTuple newtuple = &change->data.tp.newtuple->tuple;
|
||||
|
||||
/* Switch relation if publishing via root. */
|
||||
if (relentry->publish_as_relid != RelationGetRelid(relation))
|
||||
{
|
||||
Assert(relation->rd_rel->relispartition);
|
||||
relation = RelationIdGetRelation(relentry->publish_as_relid);
|
||||
/* Convert tuples if needed. */
|
||||
if (relentry->map)
|
||||
{
|
||||
oldtuple = execute_attr_map_tuple(oldtuple, relentry->map);
|
||||
newtuple = execute_attr_map_tuple(newtuple, relentry->map);
|
||||
}
|
||||
}
|
||||
|
||||
OutputPluginPrepareWrite(ctx, true);
|
||||
logicalrep_write_update(ctx->out, relation, oldtuple,
|
||||
&change->data.tp.newtuple->tuple);
|
||||
logicalrep_write_update(ctx->out, relation, oldtuple, newtuple);
|
||||
OutputPluginWrite(ctx, true);
|
||||
break;
|
||||
}
|
||||
case REORDER_BUFFER_CHANGE_DELETE:
|
||||
if (change->data.tp.oldtuple)
|
||||
{
|
||||
HeapTuple oldtuple = &change->data.tp.oldtuple->tuple;
|
||||
|
||||
/* Switch relation if publishing via root. */
|
||||
if (relentry->publish_as_relid != RelationGetRelid(relation))
|
||||
{
|
||||
Assert(relation->rd_rel->relispartition);
|
||||
relation = RelationIdGetRelation(relentry->publish_as_relid);
|
||||
/* Convert tuple if needed. */
|
||||
if (relentry->map)
|
||||
oldtuple = execute_attr_map_tuple(oldtuple, relentry->map);
|
||||
}
|
||||
|
||||
OutputPluginPrepareWrite(ctx, true);
|
||||
logicalrep_write_delete(ctx->out, relation,
|
||||
&change->data.tp.oldtuple->tuple);
|
||||
logicalrep_write_delete(ctx->out, relation, oldtuple);
|
||||
OutputPluginWrite(ctx, true);
|
||||
}
|
||||
else
|
||||
@ -412,10 +499,11 @@ pgoutput_truncate(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Don't send partitioned tables, because partitions should be sent
|
||||
* instead.
|
||||
* Don't send partitions if the publication wants to send only the
|
||||
* root tables through it.
|
||||
*/
|
||||
if (relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
|
||||
if (relation->rd_rel->relispartition &&
|
||||
relentry->publish_as_relid != relid)
|
||||
continue;
|
||||
|
||||
relids[nrelids++] = relid;
|
||||
@ -540,12 +628,15 @@ init_rel_sync_cache(MemoryContext cachectx)
|
||||
* This looks up publications that the given relation is directly or
|
||||
* indirectly part of (the latter if it's really the relation's ancestor that
|
||||
* is part of a publication) and fills up the found entry with the information
|
||||
* about which operations to publish.
|
||||
* about which operations to publish and whether to use an ancestor's schema
|
||||
* when publishing.
|
||||
*/
|
||||
static RelationSyncEntry *
|
||||
get_rel_sync_entry(PGOutputData *data, Oid relid)
|
||||
{
|
||||
RelationSyncEntry *entry;
|
||||
bool am_partition = get_rel_relispartition(relid);
|
||||
char relkind = get_rel_relkind(relid);
|
||||
bool found;
|
||||
MemoryContext oldctx;
|
||||
|
||||
@ -564,6 +655,7 @@ get_rel_sync_entry(PGOutputData *data, Oid relid)
|
||||
{
|
||||
List *pubids = GetRelationPublications(relid);
|
||||
ListCell *lc;
|
||||
Oid publish_as_relid = relid;
|
||||
|
||||
/* Reload publications if needed before use. */
|
||||
if (!publications_valid)
|
||||
@ -588,8 +680,56 @@ get_rel_sync_entry(PGOutputData *data, Oid relid)
|
||||
foreach(lc, data->publications)
|
||||
{
|
||||
Publication *pub = lfirst(lc);
|
||||
bool publish = false;
|
||||
|
||||
if (pub->alltables || list_member_oid(pubids, pub->oid))
|
||||
if (pub->alltables)
|
||||
{
|
||||
publish = true;
|
||||
if (pub->pubviaroot && am_partition)
|
||||
publish_as_relid = llast_oid(get_partition_ancestors(relid));
|
||||
}
|
||||
|
||||
if (!publish)
|
||||
{
|
||||
bool ancestor_published = false;
|
||||
|
||||
/*
|
||||
* For a partition, check if any of the ancestors are
|
||||
* published. If so, note down the topmost ancestor that is
|
||||
* published via this publication, which will be used as the
|
||||
* relation via which to publish the partition's changes.
|
||||
*/
|
||||
if (am_partition)
|
||||
{
|
||||
List *ancestors = get_partition_ancestors(relid);
|
||||
ListCell *lc2;
|
||||
|
||||
/* Find the "topmost" ancestor that is in this publication. */
|
||||
foreach(lc2, ancestors)
|
||||
{
|
||||
Oid ancestor = lfirst_oid(lc2);
|
||||
|
||||
if (list_member_oid(GetRelationPublications(ancestor),
|
||||
pub->oid))
|
||||
{
|
||||
ancestor_published = true;
|
||||
if (pub->pubviaroot)
|
||||
publish_as_relid = ancestor;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (list_member_oid(pubids, pub->oid) || ancestor_published)
|
||||
publish = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Don't publish changes for partitioned tables, because
|
||||
* publishing those of its partitions suffices, unless partition
|
||||
* changes won't be published due to pubviaroot being set.
|
||||
*/
|
||||
if (publish &&
|
||||
(relkind != RELKIND_PARTITIONED_TABLE || pub->pubviaroot))
|
||||
{
|
||||
entry->pubactions.pubinsert |= pub->pubactions.pubinsert;
|
||||
entry->pubactions.pubupdate |= pub->pubactions.pubupdate;
|
||||
@ -604,6 +744,7 @@ get_rel_sync_entry(PGOutputData *data, Oid relid)
|
||||
|
||||
list_free(pubids);
|
||||
|
||||
entry->publish_as_relid = publish_as_relid;
|
||||
entry->replicate_valid = true;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user