1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-03 20:02:46 +03:00

Implement table partitioning.

Table partitioning is like table inheritance and reuses much of the
existing infrastructure, but there are some important differences.
The parent is called a partitioned table and is always empty; it may
not have indexes or non-inherited constraints, since those make no
sense for a relation with no data of its own.  The children are called
partitions and contain all of the actual data.  Each partition has an
implicit partitioning constraint.  Multiple inheritance is not
allowed, and partitioning and inheritance can't be mixed.  Partitions
can't have extra columns and may not allow nulls unless the parent
does.  Tuples inserted into the parent are automatically routed to the
correct partition, so tuple-routing ON INSERT triggers are not needed.
Tuple routing isn't yet supported for partitions which are foreign
tables, and it doesn't handle updates that cross partition boundaries.

Currently, tables can be range-partitioned or list-partitioned.  List
partitioning is limited to a single column, but range partitioning can
involve multiple columns.  A partitioning "column" can be an
expression.

Because table partitioning is less general than table inheritance, it
is hoped that it will be easier to reason about properties of
partitions, and therefore that this will serve as a better foundation
for a variety of possible optimizations, including query planner
optimizations.  The tuple routing which this patch does based on
the implicit partitioning constraints is an example of this, but it
seems likely that many other useful optimizations are also possible.

Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat,
Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova,
Rushabh Lathia, Erik Rijkers, among others.  Minor revisions by me.
This commit is contained in:
Robert Haas
2016-12-07 13:17:43 -05:00
parent b7e1ae2328
commit f0e44751d7
85 changed files with 8896 additions and 281 deletions

View File

@ -32,6 +32,7 @@
#include "access/htup_details.h"
#include "access/multixact.h"
#include "access/nbtree.h"
#include "access/reloptions.h"
#include "access/sysattr.h"
#include "access/xact.h"
@ -40,6 +41,7 @@
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/partition.h"
#include "catalog/pg_am.h"
#include "catalog/pg_amproc.h"
#include "catalog/pg_attrdef.h"
@ -49,6 +51,7 @@
#include "catalog/pg_database.h"
#include "catalog/pg_namespace.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_partitioned_table.h"
#include "catalog/pg_proc.h"
#include "catalog/pg_rewrite.h"
#include "catalog/pg_shseclabel.h"
@ -258,6 +261,8 @@ static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK, bool force_non_hi
static Relation AllocateRelationDesc(Form_pg_class relp);
static void RelationParseRelOptions(Relation relation, HeapTuple tuple);
static void RelationBuildTupleDesc(Relation relation);
static void RelationBuildPartitionKey(Relation relation);
static PartitionKey copy_partition_key(PartitionKey fromkey);
static Relation RelationBuildDesc(Oid targetRelId, bool insertIt);
static void RelationInitPhysicalAddr(Relation relation);
static void load_critical_index(Oid indexoid, Oid heapoid);
@ -278,6 +283,8 @@ static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid,
StrategyNumber numSupport);
static void RelationCacheInitFileRemoveInDir(const char *tblspcpath);
static void unlink_initfile(const char *initfilename);
static bool equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1,
PartitionDesc partdesc2);
/*
@ -435,6 +442,7 @@ RelationParseRelOptions(Relation relation, HeapTuple tuple)
case RELKIND_INDEX:
case RELKIND_VIEW:
case RELKIND_MATVIEW:
case RELKIND_PARTITIONED_TABLE:
break;
default:
return;
@ -795,6 +803,237 @@ RelationBuildRuleLock(Relation relation)
relation->rd_rules = rulelock;
}
/*
 * RelationBuildPartitionKey
 *		Build and attach to relcache partition key data of relation
 *
 * Partitioning key data is stored in CacheMemoryContext to ensure it survives
 * as long as the relcache.  To avoid leaking memory in that context in case
 * of an error partway through this function, we build the structure in the
 * working context (which must be short-lived) and copy the completed
 * structure into the cache memory.
 *
 * Also, since the structure being created here is sufficiently complex, we
 * make a private child context of CacheMemoryContext for each relation that
 * has associated partition key information.  That means no complicated logic
 * to free individual elements whenever the relcache entry is flushed - just
 * delete the context.
 *
 * On return, relation->rd_partkey and relation->rd_partkeycxt are set, unless
 * no pg_partitioned_table row exists yet for the relation, in which case the
 * function returns without touching either field.
 */
static void
RelationBuildPartitionKey(Relation relation)
{
	Form_pg_partitioned_table form;
	HeapTuple	tuple;
	bool		isnull;
	int			i;
	PartitionKey key;
	AttrNumber *attrs;
	oidvector  *opclass;
	oidvector  *collation;
	ListCell   *partexprs_item;
	Datum		datum;
	MemoryContext partkeycxt,
				oldcxt;

	tuple = SearchSysCache1(PARTRELID,
							ObjectIdGetDatum(RelationGetRelid(relation)));

	/*
	 * The following happens when we have created our pg_class entry but not
	 * the pg_partitioned_table entry yet.
	 */
	if (!HeapTupleIsValid(tuple))
		return;

	key = (PartitionKey) palloc0(sizeof(PartitionKeyData));

	/* Fixed-length attributes */
	form = (Form_pg_partitioned_table) GETSTRUCT(tuple);
	key->strategy = form->partstrat;
	key->partnatts = form->partnatts;

	/*
	 * We can rely on the first variable-length attribute being mapped to the
	 * relevant field of the catalog's C struct, because all previous
	 * attributes are non-nullable and fixed-length.
	 */
	attrs = form->partattrs.values;

	/* But use the hard way to retrieve further variable-length attributes */
	/* Operator class */
	datum = SysCacheGetAttr(PARTRELID, tuple,
							Anum_pg_partitioned_table_partclass, &isnull);
	Assert(!isnull);
	opclass = (oidvector *) DatumGetPointer(datum);

	/* Collation */
	datum = SysCacheGetAttr(PARTRELID, tuple,
							Anum_pg_partitioned_table_partcollation, &isnull);
	Assert(!isnull);
	collation = (oidvector *) DatumGetPointer(datum);

	/* Expressions */
	datum = SysCacheGetAttr(PARTRELID, tuple,
							Anum_pg_partitioned_table_partexprs, &isnull);
	if (!isnull)
	{
		char	   *exprString;
		Node	   *expr;

		exprString = TextDatumGetCString(datum);
		expr = stringToNode(exprString);
		pfree(exprString);

		/*
		 * Run the expressions through const-simplification since the planner
		 * will be comparing them to similarly-processed qual clause operands,
		 * and may fail to detect valid matches without this step.  We don't
		 * need to bother with canonicalize_qual() though, because partition
		 * expressions are not full-fledged qualification clauses.
		 */
		expr = eval_const_expressions(NULL, (Node *) expr);

		/* May as well fix opfuncids too */
		fix_opfuncids((Node *) expr);
		key->partexprs = (List *) expr;
	}

	key->partattrs = (AttrNumber *) palloc0(key->partnatts * sizeof(AttrNumber));
	key->partopfamily = (Oid *) palloc0(key->partnatts * sizeof(Oid));
	key->partopcintype = (Oid *) palloc0(key->partnatts * sizeof(Oid));
	key->partsupfunc = (FmgrInfo *) palloc0(key->partnatts * sizeof(FmgrInfo));
	key->partcollation = (Oid *) palloc0(key->partnatts * sizeof(Oid));

	/* Gather type and collation info as well */
	key->parttypid = (Oid *) palloc0(key->partnatts * sizeof(Oid));
	key->parttypmod = (int32 *) palloc0(key->partnatts * sizeof(int32));
	key->parttyplen = (int16 *) palloc0(key->partnatts * sizeof(int16));
	key->parttypbyval = (bool *) palloc0(key->partnatts * sizeof(bool));
	key->parttypalign = (char *) palloc0(key->partnatts * sizeof(char));
	key->parttypcoll = (Oid *) palloc0(key->partnatts * sizeof(Oid));

	/*
	 * Copy partattrs and fill other per-attribute info.  Size the copy by the
	 * destination element type so it stays in step with the allocation above.
	 */
	memcpy(key->partattrs, attrs, key->partnatts * sizeof(AttrNumber));
	partexprs_item = list_head(key->partexprs);
	for (i = 0; i < key->partnatts; i++)
	{
		AttrNumber	attno = key->partattrs[i];
		HeapTuple	opclasstup;
		Form_pg_opclass opclassform;
		Oid			funcid;

		/* Collect opfamily information */
		opclasstup = SearchSysCache1(CLAOID,
									 ObjectIdGetDatum(opclass->values[i]));
		if (!HeapTupleIsValid(opclasstup))
			elog(ERROR, "cache lookup failed for opclass %u", opclass->values[i]);

		opclassform = (Form_pg_opclass) GETSTRUCT(opclasstup);
		key->partopfamily[i] = opclassform->opcfamily;
		key->partopcintype[i] = opclassform->opcintype;

		/*
		 * A btree support function covers the cases of list and range methods
		 * currently supported.
		 */
		funcid = get_opfamily_proc(opclassform->opcfamily,
								   opclassform->opcintype,
								   opclassform->opcintype,
								   BTORDER_PROC);
		/* Report a missing comparison proc clearly, before fmgr_info() */
		if (!OidIsValid(funcid))
			elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
				 BTORDER_PROC,
				 opclassform->opcintype,
				 opclassform->opcintype,
				 opclassform->opcfamily);

		fmgr_info(funcid, &key->partsupfunc[i]);

		/* Collation */
		key->partcollation[i] = collation->values[i];

		/* Collect type information */
		if (attno != 0)
		{
			/* Plain column reference: take type info from the tuple descriptor */
			key->parttypid[i] = relation->rd_att->attrs[attno - 1]->atttypid;
			key->parttypmod[i] = relation->rd_att->attrs[attno - 1]->atttypmod;
			key->parttypcoll[i] = relation->rd_att->attrs[attno - 1]->attcollation;
		}
		else
		{
			/*
			 * Expression column (attno is zero): consume the next entry of
			 * partexprs.  Each such column owns exactly one list entry, so
			 * the cursor must advance after use; otherwise every expression
			 * column would be described by the first expression.
			 */
			if (partexprs_item == NULL)
				elog(ERROR, "wrong number of partition key expressions");

			key->parttypid[i] = exprType(lfirst(partexprs_item));
			key->parttypmod[i] = exprTypmod(lfirst(partexprs_item));
			key->parttypcoll[i] = exprCollation(lfirst(partexprs_item));

			partexprs_item = lnext(partexprs_item);
		}
		get_typlenbyvalalign(key->parttypid[i],
							 &key->parttyplen[i],
							 &key->parttypbyval[i],
							 &key->parttypalign[i]);

		ReleaseSysCache(opclasstup);
	}

	ReleaseSysCache(tuple);

	/* Success --- now copy to the cache memory */
	partkeycxt = AllocSetContextCreate(CacheMemoryContext,
									   RelationGetRelationName(relation),
									   ALLOCSET_SMALL_SIZES);
	relation->rd_partkeycxt = partkeycxt;
	oldcxt = MemoryContextSwitchTo(relation->rd_partkeycxt);
	relation->rd_partkey = copy_partition_key(key);
	MemoryContextSwitchTo(oldcxt);
}
/*
 * copy_partition_key
 *
 * Returns a deep copy of fromkey.  The copy, including all of its
 * per-attribute arrays and the expression list, is allocated in the current
 * memory context.
 */
static PartitionKey
copy_partition_key(PartitionKey fromkey)
{
	PartitionKey newkey;
	int			n;
	int			i;

	newkey = (PartitionKey) palloc(sizeof(PartitionKeyData));

	newkey->strategy = fromkey->strategy;
	newkey->partnatts = n = fromkey->partnatts;

	newkey->partattrs = (AttrNumber *) palloc(n * sizeof(AttrNumber));
	memcpy(newkey->partattrs, fromkey->partattrs, n * sizeof(AttrNumber));

	newkey->partexprs = copyObject(fromkey->partexprs);

	newkey->partopfamily = (Oid *) palloc(n * sizeof(Oid));
	memcpy(newkey->partopfamily, fromkey->partopfamily, n * sizeof(Oid));

	newkey->partopcintype = (Oid *) palloc(n * sizeof(Oid));
	memcpy(newkey->partopcintype, fromkey->partopcintype, n * sizeof(Oid));

	/*
	 * FmgrInfo structs must not be duplicated with a raw memcpy: each one
	 * records the memory context (fn_mcxt) in which function-private data is
	 * to be kept.  Use fmgr_info_copy() so the copy is bound to the context
	 * the new key lives in rather than to the source's context, which may be
	 * reset long before the copy is discarded.
	 */
	newkey->partsupfunc = (FmgrInfo *) palloc(n * sizeof(FmgrInfo));
	for (i = 0; i < n; i++)
		fmgr_info_copy(&newkey->partsupfunc[i],
					   &fromkey->partsupfunc[i],
					   CurrentMemoryContext);

	newkey->partcollation = (Oid *) palloc(n * sizeof(Oid));
	memcpy(newkey->partcollation, fromkey->partcollation, n * sizeof(Oid));

	newkey->parttypid = (Oid *) palloc(n * sizeof(Oid));
	memcpy(newkey->parttypid, fromkey->parttypid, n * sizeof(Oid));

	newkey->parttypmod = (int32 *) palloc(n * sizeof(int32));
	memcpy(newkey->parttypmod, fromkey->parttypmod, n * sizeof(int32));

	newkey->parttyplen = (int16 *) palloc(n * sizeof(int16));
	memcpy(newkey->parttyplen, fromkey->parttyplen, n * sizeof(int16));

	newkey->parttypbyval = (bool *) palloc(n * sizeof(bool));
	memcpy(newkey->parttypbyval, fromkey->parttypbyval, n * sizeof(bool));

	/* parttypalign is an array of char; size the allocation to match */
	newkey->parttypalign = (char *) palloc(n * sizeof(char));
	memcpy(newkey->parttypalign, fromkey->parttypalign, n * sizeof(char));

	newkey->parttypcoll = (Oid *) palloc(n * sizeof(Oid));
	memcpy(newkey->parttypcoll, fromkey->parttypcoll, n * sizeof(Oid));

	return newkey;
}
/*
* equalRuleLocks
*
@ -922,6 +1161,58 @@ equalRSDesc(RowSecurityDesc *rsdesc1, RowSecurityDesc *rsdesc2)
return true;
}
/*
 * equalPartitionDescs
 *		Decide whether two partition descriptors are logically the same
 *
 * Two descriptors match when both are absent, or when both are present and
 * agree on the number of partitions, on every partition OID position by
 * position, and on their bound collections.  "key" is passed through to
 * partition_bounds_equal() for the bound comparison.
 */
static bool
equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1,
					PartitionDesc partdesc2)
{
	int			partno;

	/* If either descriptor is missing, they match only when both are. */
	if (partdesc1 == NULL || partdesc2 == NULL)
		return (partdesc1 == NULL && partdesc2 == NULL);

	if (partdesc1->nparts != partdesc2->nparts)
		return false;

	Assert(key != NULL || partdesc1->nparts == 0);

	/*
	 * The OID arrays are compared element-by-element; as long as no
	 * partition was added to or removed from the relation, they are
	 * expected to line up exactly.
	 */
	partno = 0;
	while (partno < partdesc1->nparts)
	{
		if (partdesc1->oids[partno] != partdesc2->oids[partno])
			return false;
		partno++;
	}

	/*
	 * Finally compare the bound collections.  If either side has none, the
	 * descriptors match only when both have none; otherwise defer to
	 * partition.c, which owns the logic for walking the collections.
	 */
	if (partdesc1->boundinfo == NULL || partdesc2->boundinfo == NULL)
		return (partdesc1->boundinfo == NULL && partdesc2->boundinfo == NULL);

	if (!partition_bounds_equal(key, partdesc1->boundinfo,
								partdesc2->boundinfo))
		return false;

	return true;
}
/*
* RelationBuildDesc
*
@ -1050,6 +1341,20 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
relation->rd_fkeylist = NIL;
relation->rd_fkeyvalid = false;
/* if a partitioned table, initialize key and partition descriptor info */
if (relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
{
RelationBuildPartitionKey(relation);
RelationBuildPartitionDesc(relation);
}
else
{
relation->rd_partkeycxt = NULL;
relation->rd_partkey = NULL;
relation->rd_partdesc = NULL;
relation->rd_pdcxt = NULL;
}
/*
* if it's an index, initialize index-related information
*/
@ -2042,6 +2347,12 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc)
MemoryContextDelete(relation->rd_rulescxt);
if (relation->rd_rsdesc)
MemoryContextDelete(relation->rd_rsdesc->rscxt);
if (relation->rd_partkeycxt)
MemoryContextDelete(relation->rd_partkeycxt);
if (relation->rd_pdcxt)
MemoryContextDelete(relation->rd_pdcxt);
if (relation->rd_partcheck)
pfree(relation->rd_partcheck);
if (relation->rd_fdwroutine)
pfree(relation->rd_fdwroutine);
pfree(relation);
@ -2190,11 +2501,12 @@ RelationClearRelation(Relation relation, bool rebuild)
*
* When rebuilding an open relcache entry, we must preserve ref count,
* rd_createSubid/rd_newRelfilenodeSubid, and rd_toastoid state. Also
* attempt to preserve the pg_class entry (rd_rel), tupledesc, and
* rewrite-rule substructures in place, because various places assume
* that these structures won't move while they are working with an
* open relcache entry. (Note: the refcount mechanism for tupledescs
* might someday allow us to remove this hack for the tupledesc.)
* attempt to preserve the pg_class entry (rd_rel), tupledesc,
* rewrite-rule, partition key, and partition descriptor substructures
* in place, because various places assume that these structures won't
* move while they are working with an open relcache entry. (Note:
* the refcount mechanism for tupledescs might someday allow us to
* remove this hack for the tupledesc.)
*
* Note that this process does not touch CurrentResourceOwner; which
* is good because whatever ref counts the entry may have do not
@ -2205,6 +2517,8 @@ RelationClearRelation(Relation relation, bool rebuild)
bool keep_tupdesc;
bool keep_rules;
bool keep_policies;
bool keep_partkey;
bool keep_partdesc;
/* Build temporary entry, but don't link it into hashtable */
newrel = RelationBuildDesc(save_relid, false);
@ -2235,6 +2549,10 @@ RelationClearRelation(Relation relation, bool rebuild)
keep_tupdesc = equalTupleDescs(relation->rd_att, newrel->rd_att);
keep_rules = equalRuleLocks(relation->rd_rules, newrel->rd_rules);
keep_policies = equalRSDesc(relation->rd_rsdesc, newrel->rd_rsdesc);
keep_partkey = (relation->rd_partkey != NULL);
keep_partdesc = equalPartitionDescs(relation->rd_partkey,
relation->rd_partdesc,
newrel->rd_partdesc);
/*
* Perform swapping of the relcache entry contents. Within this
@ -2289,6 +2607,18 @@ RelationClearRelation(Relation relation, bool rebuild)
SWAPFIELD(Oid, rd_toastoid);
/* pgstat_info must be preserved */
SWAPFIELD(struct PgStat_TableStatus *, pgstat_info);
/* partition key must be preserved, if we have one */
if (keep_partkey)
{
SWAPFIELD(PartitionKey, rd_partkey);
SWAPFIELD(MemoryContext, rd_partkeycxt);
}
/* preserve old partdesc if no logical change */
if (keep_partdesc)
{
SWAPFIELD(PartitionDesc, rd_partdesc);
SWAPFIELD(MemoryContext, rd_pdcxt);
}
#undef SWAPFIELD
@ -2983,7 +3313,9 @@ RelationBuildLocalRelation(const char *relname,
/* system relations and non-table objects don't have one */
if (!IsSystemNamespace(relnamespace) &&
(relkind == RELKIND_RELATION || relkind == RELKIND_MATVIEW))
(relkind == RELKIND_RELATION ||
relkind == RELKIND_MATVIEW ||
relkind == RELKIND_PARTITIONED_TABLE))
rel->rd_rel->relreplident = REPLICA_IDENTITY_DEFAULT;
else
rel->rd_rel->relreplident = REPLICA_IDENTITY_NOTHING;
@ -3514,6 +3846,20 @@ RelationCacheInitializePhase3(void)
restart = true;
}
/*
* Reload partition key and descriptor for a partitioned table.
*/
if (relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
{
RelationBuildPartitionKey(relation);
Assert(relation->rd_partkey != NULL);
RelationBuildPartitionDesc(relation);
Assert(relation->rd_partdesc != NULL);
restart = true;
}
/* Release hold on the relation */
RelationDecrementReferenceCount(relation);
@ -4267,6 +4613,8 @@ RelationGetIndexExpressions(Relation relation)
*/
result = (List *) eval_const_expressions(NULL, (Node *) result);
result = (List *) canonicalize_qual((Expr *) result);
/* May as well fix opfuncids too */
fix_opfuncids((Node *) result);
@ -5035,6 +5383,10 @@ load_relcache_init_file(bool shared)
rel->rd_rulescxt = NULL;
rel->trigdesc = NULL;
rel->rd_rsdesc = NULL;
rel->rd_partkeycxt = NULL;
rel->rd_partkey = NULL;
rel->rd_partdesc = NULL;
rel->rd_partcheck = NIL;
rel->rd_indexprs = NIL;
rel->rd_indpred = NIL;
rel->rd_exclops = NULL;

View File

@ -48,6 +48,7 @@
#include "catalog/pg_opclass.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_opfamily.h"
#include "catalog/pg_partitioned_table.h"
#include "catalog/pg_proc.h"
#include "catalog/pg_range.h"
#include "catalog/pg_rewrite.h"
@ -568,6 +569,17 @@ static const struct cachedesc cacheinfo[] = {
},
8
},
{PartitionedRelationId, /* PARTRELID */
PartitionedRelidIndexId,
1,
{
Anum_pg_partitioned_table_partrelid,
0,
0,
0
},
32
},
{ProcedureRelationId, /* PROCNAMEARGSNSP */
ProcedureNameArgsNspIndexId,
3,