1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-31 10:30:33 +03:00

Faster partition pruning

Add a new module backend/partitioning/partprune.c, implementing a more
sophisticated algorithm for partition pruning.  The new module uses each
partition's "boundinfo" for pruning instead of constraint exclusion,
based on an idea proposed by Robert Haas of a "pruning program": a list
of steps generated from the query quals which are run iteratively to
obtain a list of partitions that must be scanned in order to satisfy
those quals.

At present, this targets planner-time partition pruning, but there exist
further patches to apply partition pruning at execution time as well.

This commit also moves some definitions from include/catalog/partition.h
to a new file include/partitioning/partbounds.h, in an attempt to
rationalize partitioning related code.

Authors: Amit Langote, David Rowley, Dilip Kumar
Reviewers: Robert Haas, Kyotaro Horiguchi, Ashutosh Bapat, Jesper Pedersen.
Discussion: https://postgr.es/m/098b9c71-1915-1a2a-8d52-1a7a50ce79e8@lab.ntt.co.jp
This commit is contained in:
Alvaro Herrera
2018-04-06 16:23:04 -03:00
parent 11523e860f
commit 9fdb675fc5
27 changed files with 3993 additions and 415 deletions

View File

@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 201804052
#define CATALOG_VERSION_NO 201804061
#endif

View File

@@ -26,7 +26,7 @@
* PartitionBoundInfo encapsulates a set of partition bounds. It is usually
* associated with partitioned tables as part of its partition descriptor.
*
* The internal structure is opaque outside partition.c.
* The internal structure appears in partbounds.h.
*/
typedef struct PartitionBoundInfoData *PartitionBoundInfo;
@@ -70,7 +70,6 @@ extern void check_default_allows_bound(Relation parent, Relation defaultRel,
PartitionBoundSpec *new_spec);
extern List *get_proposed_default_constraint(List *new_part_constaints);
/* For tuple routing */
extern int get_partition_for_tuple(Relation relation, Datum *values,
bool *isnull);

View File

@@ -53,6 +53,9 @@ typedef FormData_pg_opfamily *Form_pg_opfamily;
#define Anum_pg_opfamily_opfnamespace 3
#define Anum_pg_opfamily_opfowner 4
/*
* True if 'opfamily' is one of the two boolean operator families
* (BOOL_BTREE_FAM_OID or BOOL_HASH_FAM_OID). Note that the macro argument
* is evaluated twice, so it must not have side effects.
*/
#define IsBooleanOpfamily(opfamily) \
((opfamily) == BOOL_BTREE_FAM_OID || (opfamily) == BOOL_HASH_FAM_OID)
/* ----------------
* initial contents of pg_opfamily
* ----------------

View File

@@ -193,6 +193,9 @@ typedef enum NodeTag
T_FromExpr,
T_OnConflictExpr,
T_IntoClause,
T_PartitionPruneStep,
T_PartitionPruneStepOp,
T_PartitionPruneStepCombine,
/*
* TAGS FOR EXPRESSION STATE NODES (execnodes.h)
@@ -262,7 +265,6 @@ typedef enum NodeTag
T_PlaceHolderVar,
T_SpecialJoinInfo,
T_AppendRelInfo,
T_PartitionedChildRelInfo,
T_PlaceHolderInfo,
T_MinMaxAggInfo,
T_PlannerParamItem,

View File

@@ -18,6 +18,7 @@
#define PRIMNODES_H
#include "access/attnum.h"
#include "access/stratnum.h"
#include "nodes/bitmapset.h"
#include "nodes/pg_list.h"
@@ -1506,4 +1507,78 @@ typedef struct OnConflictExpr
List *exclRelTlist; /* tlist of the EXCLUDED pseudo relation */
} OnConflictExpr;
/*
* Node types to represent a partition pruning step.
*/
/*
* The base Node type. step_id is the global identifier of a given step
* within a given pruning context.
*/
typedef struct PartitionPruneStep
{
/* node tag identifying the concrete step type */
NodeTag type;
/* identifier of this step within its pruning context */
int step_id;
} PartitionPruneStep;
/*----------
* PartitionPruneStepOp - Information to prune using a set of mutually AND'd
* OpExpr clauses
*
* This contains information extracted from up to partnatts OpExpr clauses,
* where partnatts is the number of partition key columns. 'opstrategy' is the
* strategy of the operator in the clause matched to the last partition key.
* 'exprs' contains expressions which comprise the lookup key to be passed to
* the partition bound search function. 'cmpfns' contains the OIDs of the
* comparison functions used to compare the aforementioned expressions with
* partition bounds. 'exprs' and 'cmpfns' contain the same number of items,
* which is at most partnatts.
*
* Once we find the offset of a partition bound using the lookup key, we
* determine which partitions to include in the result based on the value of
* 'opstrategy'. For example, if it were equality, we'd return just the
* partition that would contain that key or a set of partitions if the key
* didn't consist of all partitioning columns. For non-equality strategies,
* we'd need to include other partitions as appropriate.
*
* 'nullkeys' is the set containing the offsets of the partition keys (0 to
* partnatts - 1) that were matched to an IS NULL clause. This is only
* considered for hash partitioning as we need to pass which keys are null
* to the hash partition bound search function. It is never possible to
* have an expression be present in 'exprs' for a given partition key and
* the corresponding bit set in 'nullkeys'.
*----------
*/
typedef struct PartitionPruneStepOp
{
/* common step header (node tag and step_id) */
PartitionPruneStep step;
/* strategy of the operator in the clause matched to the last partition key */
StrategyNumber opstrategy;
/* expressions comprising the lookup key for the partition bound search */
List *exprs;
/* OIDs of the comparison functions; same number of items as 'exprs' */
List *cmpfns;
/* offsets of partition keys matched to IS NULL; hash partitioning only */
Bitmapset *nullkeys;
} PartitionPruneStepOp;
/*----------
* PartitionPruneStepCombine - Information to prune using a BoolExpr clause
*
* For BoolExpr clauses, we combine the set of partitions determined for each
* of its argument clauses.
*----------
*/
/* How a combine step merges the sets of partitions it is given. */
typedef enum PartitionPruneCombineOp
{
/* NOTE(review): presumably used for OR'd argument clauses -- confirm */
PARTPRUNE_COMBINE_UNION,
/* NOTE(review): presumably used for AND'd argument clauses -- confirm */
PARTPRUNE_COMBINE_INTERSECT
} PartitionPruneCombineOp;
typedef struct PartitionPruneStepCombine
{
/* common step header (node tag and step_id) */
PartitionPruneStep step;
/* whether the partition sets are unioned or intersected */
PartitionPruneCombineOp combineOp;
/* NOTE(review): presumably step_ids of the steps whose results are combined -- confirm */
List *source_stepids;
} PartitionPruneStepCombine;
#endif /* PRIMNODES_H */

View File

@@ -15,6 +15,7 @@
#define RELATION_H
#include "access/sdir.h"
#include "fmgr.h"
#include "lib/stringinfo.h"
#include "nodes/params.h"
#include "nodes/parsenodes.h"
@@ -253,8 +254,6 @@ typedef struct PlannerInfo
List *append_rel_list; /* list of AppendRelInfos */
List *pcinfo_list; /* list of PartitionedChildRelInfos */
List *rowMarks; /* list of PlanRowMarks */
List *placeholder_list; /* list of PlaceHolderInfos */
@@ -319,6 +318,9 @@ typedef struct PlannerInfo
/* optional private data for join_search_hook, e.g., GEQO */
void *join_search_private;
/* Does this query modify any partition key columns? */
bool partColsUpdated;
} PlannerInfo;
@@ -356,6 +358,9 @@ typedef struct PartitionSchemeData
/* Cached information about partition key data types. */
int16 *parttyplen;
bool *parttypbyval;
/* Cached information about partition comparison functions. */
FmgrInfo *partsupfunc;
} PartitionSchemeData;
typedef struct PartitionSchemeData *PartitionScheme;
@@ -528,11 +533,15 @@ typedef struct PartitionSchemeData *PartitionScheme;
*
* If the relation is partitioned, these fields will be set:
*
* part_scheme - Partitioning scheme of the relation
* boundinfo - Partition bounds
* nparts - Number of partitions
* part_rels - RelOptInfos for each partition
* partexprs, nullable_partexprs - Partition key expressions
* part_scheme - Partitioning scheme of the relation
* nparts - Number of partitions
* boundinfo - Partition bounds
* partition_qual - Partition constraint if not the root
* part_rels - RelOptInfos for each partition
* partexprs, nullable_partexprs - Partition key expressions
* partitioned_child_rels - RT indexes of unpruned partitions of
* relation that are partitioned tables
* themselves
*
* Note: A base relation always has only one set of partition keys, but a join
* relation may have as many sets of partition keys as the number of relations
@@ -663,10 +672,12 @@ typedef struct RelOptInfo
PartitionScheme part_scheme; /* Partitioning scheme. */
int nparts; /* number of partitions */
struct PartitionBoundInfoData *boundinfo; /* Partition bounds */
List *partition_qual; /* partition constraint */
struct RelOptInfo **part_rels; /* Array of RelOptInfos of partitions,
* stored in the same order of bounds */
List **partexprs; /* Non-nullable partition key expressions. */
List **nullable_partexprs; /* Nullable partition key expressions. */
List *partitioned_child_rels; /* List of RT indexes. */
} RelOptInfo;
/*
@@ -1686,7 +1697,7 @@ typedef struct ModifyTablePath
List *partitioned_rels;
bool partColsUpdated; /* some part key in hierarchy updated */
List *resultRelations; /* integer list of RT indexes */
Index mergeTargetRelation;/* RT index of merge target relation */
Index mergeTargetRelation; /* RT index of merge target relation */
List *subpaths; /* Path(s) producing source data */
List *subroots; /* per-target-table PlannerInfos */
List *withCheckOptionLists; /* per-target-table WCO lists */
@@ -2121,27 +2132,6 @@ typedef struct AppendRelInfo
Oid parent_reloid; /* OID of parent relation */
} AppendRelInfo;
/*
* For a partitioned table, this maps its RT index to the list of RT indexes
* of the partitioned child tables in the partition tree. We need to
* separately store this information, because we do not create AppendRelInfos
* for the partitioned child tables of a parent table, since AppendRelInfos
* contain information that is unnecessary for the partitioned child tables.
* The child_rels list must contain at least one element, because the parent
* partitioned table is itself counted as a child.
*
* These structs are kept in the PlannerInfo node's pcinfo_list.
*/
typedef struct PartitionedChildRelInfo
{
NodeTag type;
/* RT index of the partitioned table this entry describes */
Index parent_relid;
/*
* RT indexes of the partitioned child tables in the partition tree; never
* empty, because the parent itself is counted as a child
*/
List *child_rels;
bool part_cols_updated; /* is the partition key of any of
* the partitioned tables updated? */
} PartitionedChildRelInfo;
/*
* For each distinct placeholder expression generated during planning, we
* store a PlaceHolderInfo node in the PlannerInfo node's placeholder_list.

View File

@@ -59,9 +59,4 @@ extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr);
extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid);
extern int plan_create_index_workers(Oid tableOid, Oid indexOid);
extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti,
bool *part_cols_updated);
extern List *get_partitioned_child_rels_for_join(PlannerInfo *root,
Relids join_relids);
#endif /* PLANNER_H */

View File

@@ -0,0 +1,124 @@
/*-------------------------------------------------------------------------
*
* partbounds.h
*
* Copyright (c) 2007-2018, PostgreSQL Global Development Group
*
* src/include/partitioning/partbounds.h
*
*-------------------------------------------------------------------------
*/
#ifndef PARTBOUNDS_H
#define PARTBOUNDS_H
#include "catalog/partition.h"
/*
* PartitionBoundInfoData encapsulates a set of partition bounds. It is
* usually associated with partitioned tables as part of its partition
* descriptor, but may also be used to represent a virtual partitioned
* table such as a partitioned joinrel within the planner.
*
* A list partition datum that is known to be NULL is never put into the
* datums array. Instead, it is tracked using the null_index field.
*
* In the case of range partitioning, ndatums will typically be far less than
* 2 * nparts, because a partition's upper bound and the next partition's lower
* bound are the same in most common cases, and we only store one of them (the
* upper bound). In the case of hash partitioning, ndatums will be the same
* as the number of partitions.
*
* For range and list partitioned tables, datums is an array of datum-tuples
* with key->partnatts datums each. For hash partitioned tables, it is an array
* of datum-tuples with 2 datums, modulus and remainder, corresponding to a
* given partition.
*
* The datums in datums array are arranged in increasing order as defined by
* functions qsort_partition_rbound_cmp(), qsort_partition_list_value_cmp() and
* qsort_partition_hbound_cmp() for range, list and hash partitioned tables
* respectively. For range and list partitions this simply means that the
* datums in the datums array are arranged in increasing order as defined by
* the partition key's operator classes and collations.
*
* In the case of list partitioning, the indexes array stores one entry for
* every datum, which is the index of the partition that accepts a given datum.
* In case of range partitioning, it stores one entry per distinct range
* datum, which is the index of the partition for which a given datum
* is an upper bound. In the case of hash partitioning, the number of
* entries in the indexes array is the same as the greatest modulus amongst
* all partitions. For a given partition key datum-tuple, the index of the
* partition which would accept that datum-tuple is given by the entry
* pointed to by the remainder produced when the hash value of the
* datum-tuple is divided by the greatest modulus.
*/
typedef struct PartitionBoundInfoData
{
char strategy; /* hash, list or range? */
int ndatums; /* Length of the datums array that follows */
/*
* Array of ndatums datum-tuples, kept in increasing order: modulus and
* remainder for hash partitioning, otherwise one datum per partition key
* column.
*/
Datum **datums;
PartitionRangeDatumKind **kind; /* The kind of each range bound datum;
* NULL for hash and list partitioned
* tables */
int *indexes; /* Partition indexes */
int null_index; /* Index of the null-accepting partition; -1
* if there isn't one */
int default_index; /* Index of the default partition; -1 if there
* isn't one */
} PartitionBoundInfoData;
/* True if the bound info has a null-accepting partition (null_index != -1). */
#define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1)
/* True if the bound info has a default partition (default_index != -1). */
#define partition_bound_has_default(bi) ((bi)->default_index != -1)
/*
* When qsort'ing partition bounds after reading from the catalog, each bound
* is represented with one of the following structs.
*/
/* One bound of a hash partition */
typedef struct PartitionHashBound
{
int modulus; /* modulus of this hash bound */
int remainder; /* remainder of this hash bound */
int index; /* NOTE(review): presumably the index of the
* owning partition -- confirm */
} PartitionHashBound;
/* One value coming from some (index'th) list partition */
typedef struct PartitionListValue
{
int index; /* index of the list partition the value comes from */
Datum value; /* the list value itself */
} PartitionListValue;
/* One bound of a range partition */
typedef struct PartitionRangeBound
{
int index; /* NOTE(review): presumably the index of the
* partition this bound belongs to -- confirm */
Datum *datums; /* range bound datums */
PartitionRangeDatumKind *kind; /* the kind of each datum */
bool lower; /* this is the lower (vs upper) bound */
} PartitionRangeBound;
extern int get_hash_partition_greatest_modulus(PartitionBoundInfo b);
extern int partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation,
PartitionBoundInfo boundinfo,
Datum value, bool *is_equal);
extern int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc,
Oid *partcollation,
PartitionBoundInfo boundinfo,
PartitionRangeBound *probe, bool *is_equal);
extern int partition_range_datum_bsearch(FmgrInfo *partsupfunc,
Oid *partcollation,
PartitionBoundInfo boundinfo,
int nvalues, Datum *values, bool *is_equal);
extern int partition_hash_bsearch(PartitionBoundInfo boundinfo,
int modulus, int remainder);
extern uint64 compute_hash_value(int partnatts, FmgrInfo *partsupfunc,
Datum *values, bool *isnull);
extern int32 partition_rbound_datum_cmp(FmgrInfo *partsupfunc,
Oid *partcollation,
Datum *rb_datums, PartitionRangeDatumKind *rb_kind,
Datum *tuple_datums, int n_tuple_datums);
#endif /* PARTBOUNDS_H */

View File

@@ -0,0 +1,49 @@
/*-------------------------------------------------------------------------
*
* partprune.h
* prototypes for partprune.c
*
*
* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/partitioning/partprune.h
*
*-------------------------------------------------------------------------
*/
#ifndef PARTPRUNE_H
#define PARTPRUNE_H
#include "catalog/partition.h"
#include "nodes/relation.h"
/*
* PartitionPruneContext
*
* Information about a partitioned table needed to perform partition pruning.
*/
typedef struct PartitionPruneContext
{
/* Partition key information */
char strategy; /* NOTE(review): presumably hash, list or
* range, as in PartitionBoundInfoData --
* confirm */
int partnatts; /* number of partition key columns */
/*
* NOTE(review): the four arrays below presumably hold one entry per
* partition key column (partnatts entries) -- confirm against the code that
* fills in the context.
*/
Oid *partopfamily;
Oid *partopcintype;
Oid *partcollation;
FmgrInfo *partsupfunc;
/* Number of partitions */
int nparts;
/* Partition boundary info */
PartitionBoundInfo boundinfo;
} PartitionPruneContext;
extern Relids prune_append_rel_partitions(RelOptInfo *rel);
extern Bitmapset *get_matching_partitions(PartitionPruneContext *context,
List *pruning_steps);
extern List *gen_partprune_steps(RelOptInfo *rel, List *clauses,
bool *contradictory);
#endif /* PARTPRUNE_H */