mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-31 10:30:33 +03:00 
			
		
		
		
	Faster partition pruning
Add a new module backend/partitioning/partprune.c, implementing a more sophisticated algorithm for partition pruning. The new module uses each partition's "boundinfo" for pruning instead of constraint exclusion, based on an idea proposed by Robert Haas of a "pruning program": a list of steps generated from the query quals which are run iteratively to obtain a list of partitions that must be scanned in order to satisfy those quals. At present, this targets planner-time partition pruning, but there exist further patches to apply partition pruning at execution time as well. This commit also moves some definitions from include/catalog/partition.h to a new file include/partitioning/partbounds.h, in an attempt to rationalize partitioning related code. Authors: Amit Langote, David Rowley, Dilip Kumar Reviewers: Robert Haas, Kyotaro Horiguchi, Ashutosh Bapat, Jesper Pedersen. Discussion: https://postgr.es/m/098b9c71-1915-1a2a-8d52-1a7a50ce79e8@lab.ntt.co.jp
This commit is contained in:
		| @@ -53,6 +53,6 @@ | ||||
|  */ | ||||
|  | ||||
| /*							yyyymmddN */ | ||||
| #define CATALOG_VERSION_NO	201804052 | ||||
| #define CATALOG_VERSION_NO	201804061 | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -26,7 +26,7 @@ | ||||
|  * PartitionBoundInfo encapsulates a set of partition bounds.  It is usually | ||||
|  * associated with partitioned tables as part of its partition descriptor. | ||||
|  * | ||||
|  * The internal structure is opaque outside partition.c. | ||||
|  * The internal structure appears in partbounds.h. | ||||
|  */ | ||||
| typedef struct PartitionBoundInfoData *PartitionBoundInfo; | ||||
|  | ||||
| @@ -70,7 +70,6 @@ extern void check_default_allows_bound(Relation parent, Relation defaultRel, | ||||
| 						   PartitionBoundSpec *new_spec); | ||||
| extern List *get_proposed_default_constraint(List *new_part_constaints); | ||||
|  | ||||
| /* For tuple routing */ | ||||
| extern int get_partition_for_tuple(Relation relation, Datum *values, | ||||
| 						bool *isnull); | ||||
|  | ||||
|   | ||||
| @@ -53,6 +53,9 @@ typedef FormData_pg_opfamily *Form_pg_opfamily; | ||||
| #define Anum_pg_opfamily_opfnamespace	3 | ||||
| #define Anum_pg_opfamily_opfowner		4 | ||||
|  | ||||
| #define IsBooleanOpfamily(opfamily) \ | ||||
| 	((opfamily) == BOOL_BTREE_FAM_OID || (opfamily) == BOOL_HASH_FAM_OID) | ||||
|  | ||||
| /* ---------------- | ||||
|  *		initial contents of pg_opfamily | ||||
|  * ---------------- | ||||
|   | ||||
| @@ -193,6 +193,9 @@ typedef enum NodeTag | ||||
| 	T_FromExpr, | ||||
| 	T_OnConflictExpr, | ||||
| 	T_IntoClause, | ||||
| 	T_PartitionPruneStep, | ||||
| 	T_PartitionPruneStepOp, | ||||
| 	T_PartitionPruneStepCombine, | ||||
|  | ||||
| 	/* | ||||
| 	 * TAGS FOR EXPRESSION STATE NODES (execnodes.h) | ||||
| @@ -262,7 +265,6 @@ typedef enum NodeTag | ||||
| 	T_PlaceHolderVar, | ||||
| 	T_SpecialJoinInfo, | ||||
| 	T_AppendRelInfo, | ||||
| 	T_PartitionedChildRelInfo, | ||||
| 	T_PlaceHolderInfo, | ||||
| 	T_MinMaxAggInfo, | ||||
| 	T_PlannerParamItem, | ||||
|   | ||||
| @@ -18,6 +18,7 @@ | ||||
| #define PRIMNODES_H | ||||
|  | ||||
| #include "access/attnum.h" | ||||
| #include "access/stratnum.h" | ||||
| #include "nodes/bitmapset.h" | ||||
| #include "nodes/pg_list.h" | ||||
|  | ||||
| @@ -1506,4 +1507,78 @@ typedef struct OnConflictExpr | ||||
| 	List	   *exclRelTlist;	/* tlist of the EXCLUDED pseudo relation */ | ||||
| } OnConflictExpr; | ||||
|  | ||||
|  | ||||
| /* | ||||
|  * Node types to represent a partition pruning step. | ||||
|  */ | ||||
|  | ||||
| /* | ||||
|  * The base Node type.  step_id is the global identifier of a given step | ||||
|  * within a given pruning context. | ||||
|  */ | ||||
| typedef struct PartitionPruneStep | ||||
| { | ||||
| 	NodeTag		type; | ||||
| 	int			step_id; | ||||
| } PartitionPruneStep; | ||||
|  | ||||
| /*---------- | ||||
|  * PartitionPruneStepOp - Information to prune using a set of mutually AND'd | ||||
|  *							OpExpr clauses | ||||
|  * | ||||
|  * This contains information extracted from up to partnatts OpExpr clauses, | ||||
|  * where partnatts is the number of partition key columns.  'opstrategy' is the | ||||
|  * strategy of the operator in the clause matched to the last partition key. | ||||
|  * 'exprs' contains expressions which comprise the lookup key to be passed to | ||||
|  * the partition bound search function.  'cmpfns' contains the OIDs of | ||||
|  * comparison function used to compare aforementioned expressions with | ||||
|  * partition bounds.  Both 'exprs' and 'cmpfns' contain the same number of | ||||
|  * items up to partnatts items. | ||||
|  * | ||||
|  * Once we find the offset of a partition bound using the lookup key, we | ||||
|  * determine which partitions to include in the result based on the value of | ||||
|  * 'opstrategy'.  For example, if it were equality, we'd return just the | ||||
|  * partition that would contain that key or a set of partitions if the key | ||||
|  * didn't consist of all partitioning columns.  For non-equality strategies, | ||||
|  * we'd need to include other partitions as appropriate. | ||||
|  * | ||||
|  * 'nullkeys' is the set containing the offset of the partition keys (0 to | ||||
|  * partnatts - 1) that were matched to an IS NULL clause.  This is only | ||||
|  * considered for hash partitioning as we need to pass which keys are null | ||||
|  * to the hash partition bound search function.  It is never possible to | ||||
|  * have an expression be present in 'exprs' for a given partition key and | ||||
|  * the corresponding bit set in 'nullkeys'. | ||||
|  *---------- | ||||
|  */ | ||||
| typedef struct PartitionPruneStepOp | ||||
| { | ||||
| 	PartitionPruneStep step; | ||||
|  | ||||
| 	StrategyNumber opstrategy; | ||||
| 	List	   *exprs; | ||||
| 	List	   *cmpfns; | ||||
| 	Bitmapset  *nullkeys; | ||||
| } PartitionPruneStepOp; | ||||
|  | ||||
| /*---------- | ||||
|  * PartitionPruneStepCombine - Information to prune using a BoolExpr clause | ||||
|  * | ||||
|  * For BoolExpr clauses, we combine the set of partitions determined for each | ||||
|  * of its argument clauses. | ||||
|  *---------- | ||||
|  */ | ||||
| typedef enum PartitionPruneCombineOp | ||||
| { | ||||
| 	PARTPRUNE_COMBINE_UNION, | ||||
| 	PARTPRUNE_COMBINE_INTERSECT | ||||
| } PartitionPruneCombineOp; | ||||
|  | ||||
| typedef struct PartitionPruneStepCombine | ||||
| { | ||||
| 	PartitionPruneStep step; | ||||
|  | ||||
| 	PartitionPruneCombineOp combineOp; | ||||
| 	List	   *source_stepids; | ||||
| } PartitionPruneStepCombine; | ||||
|  | ||||
| #endif							/* PRIMNODES_H */ | ||||
|   | ||||
| @@ -15,6 +15,7 @@ | ||||
| #define RELATION_H | ||||
|  | ||||
| #include "access/sdir.h" | ||||
| #include "fmgr.h" | ||||
| #include "lib/stringinfo.h" | ||||
| #include "nodes/params.h" | ||||
| #include "nodes/parsenodes.h" | ||||
| @@ -253,8 +254,6 @@ typedef struct PlannerInfo | ||||
|  | ||||
| 	List	   *append_rel_list;	/* list of AppendRelInfos */ | ||||
|  | ||||
| 	List	   *pcinfo_list;	/* list of PartitionedChildRelInfos */ | ||||
|  | ||||
| 	List	   *rowMarks;		/* list of PlanRowMarks */ | ||||
|  | ||||
| 	List	   *placeholder_list;	/* list of PlaceHolderInfos */ | ||||
| @@ -319,6 +318,9 @@ typedef struct PlannerInfo | ||||
|  | ||||
| 	/* optional private data for join_search_hook, e.g., GEQO */ | ||||
| 	void	   *join_search_private; | ||||
|  | ||||
| 	/* Does this query modify any partition key columns? */ | ||||
| 	bool		partColsUpdated; | ||||
| } PlannerInfo; | ||||
|  | ||||
|  | ||||
| @@ -356,6 +358,9 @@ typedef struct PartitionSchemeData | ||||
| 	/* Cached information about partition key data types. */ | ||||
| 	int16	   *parttyplen; | ||||
| 	bool	   *parttypbyval; | ||||
|  | ||||
| 	/* Cached information about partition comparison functions. */ | ||||
| 	FmgrInfo   *partsupfunc; | ||||
| }			PartitionSchemeData; | ||||
|  | ||||
| typedef struct PartitionSchemeData *PartitionScheme; | ||||
| @@ -528,11 +533,15 @@ typedef struct PartitionSchemeData *PartitionScheme; | ||||
|  * | ||||
|  * If the relation is partitioned, these fields will be set: | ||||
|  * | ||||
|  * 		part_scheme - Partitioning scheme of the relation | ||||
|  * 		boundinfo - Partition bounds | ||||
|  * 		nparts - Number of partitions | ||||
|  * 		part_rels - RelOptInfos for each partition | ||||
|  * 		partexprs, nullable_partexprs - Partition key expressions | ||||
|  *		part_scheme - Partitioning scheme of the relation | ||||
|  *		nparts - Number of partitions | ||||
|  *		boundinfo - Partition bounds | ||||
|  *		partition_qual - Partition constraint if not the root | ||||
|  *		part_rels - RelOptInfos for each partition | ||||
|  *		partexprs, nullable_partexprs - Partition key expressions | ||||
|  *		partitioned_child_rels - RT indexes of unpruned partitions of | ||||
|  *								 relation that are partitioned tables | ||||
|  *								 themselves | ||||
|  * | ||||
|  * Note: A base relation always has only one set of partition keys, but a join | ||||
|  * relation may have as many sets of partition keys as the number of relations | ||||
| @@ -663,10 +672,12 @@ typedef struct RelOptInfo | ||||
| 	PartitionScheme part_scheme;	/* Partitioning scheme. */ | ||||
| 	int			nparts;			/* number of partitions */ | ||||
| 	struct PartitionBoundInfoData *boundinfo;	/* Partition bounds */ | ||||
| 	List	   *partition_qual; /* partition constraint */ | ||||
| 	struct RelOptInfo **part_rels;	/* Array of RelOptInfos of partitions, | ||||
| 									 * stored in the same order of bounds */ | ||||
| 	List	  **partexprs;		/* Non-nullable partition key expressions. */ | ||||
| 	List	  **nullable_partexprs; /* Nullable partition key expressions. */ | ||||
| 	List	   *partitioned_child_rels; /* List of RT indexes. */ | ||||
| } RelOptInfo; | ||||
|  | ||||
| /* | ||||
| @@ -1686,7 +1697,7 @@ typedef struct ModifyTablePath | ||||
| 	List	   *partitioned_rels; | ||||
| 	bool		partColsUpdated;	/* some part key in hierarchy updated */ | ||||
| 	List	   *resultRelations;	/* integer list of RT indexes */ | ||||
| 	Index	  	mergeTargetRelation;/* RT index of merge target relation */ | ||||
| 	Index		mergeTargetRelation;	/* RT index of merge target relation */ | ||||
| 	List	   *subpaths;		/* Path(s) producing source data */ | ||||
| 	List	   *subroots;		/* per-target-table PlannerInfos */ | ||||
| 	List	   *withCheckOptionLists;	/* per-target-table WCO lists */ | ||||
| @@ -2121,27 +2132,6 @@ typedef struct AppendRelInfo | ||||
| 	Oid			parent_reloid;	/* OID of parent relation */ | ||||
| } AppendRelInfo; | ||||
|  | ||||
| /* | ||||
|  * For a partitioned table, this maps its RT index to the list of RT indexes | ||||
|  * of the partitioned child tables in the partition tree.  We need to | ||||
|  * separately store this information, because we do not create AppendRelInfos | ||||
|  * for the partitioned child tables of a parent table, since AppendRelInfos | ||||
|  * contain information that is unnecessary for the partitioned child tables. | ||||
|  * The child_rels list must contain at least one element, because the parent | ||||
|  * partitioned table is itself counted as a child. | ||||
|  * | ||||
|  * These structs are kept in the PlannerInfo node's pcinfo_list. | ||||
|  */ | ||||
| typedef struct PartitionedChildRelInfo | ||||
| { | ||||
| 	NodeTag		type; | ||||
|  | ||||
| 	Index		parent_relid; | ||||
| 	List	   *child_rels; | ||||
| 	bool		part_cols_updated;	/* is the partition key of any of | ||||
| 									 * the partitioned tables updated? */ | ||||
| } PartitionedChildRelInfo; | ||||
|  | ||||
| /* | ||||
|  * For each distinct placeholder expression generated during planning, we | ||||
|  * store a PlaceHolderInfo node in the PlannerInfo node's placeholder_list. | ||||
|   | ||||
| @@ -59,9 +59,4 @@ extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr); | ||||
| extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid); | ||||
| extern int	plan_create_index_workers(Oid tableOid, Oid indexOid); | ||||
|  | ||||
| extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti, | ||||
| 						   bool *part_cols_updated); | ||||
| extern List *get_partitioned_child_rels_for_join(PlannerInfo *root, | ||||
| 									Relids join_relids); | ||||
|  | ||||
| #endif							/* PLANNER_H */ | ||||
|   | ||||
							
								
								
									
										124
									
								
								src/include/partitioning/partbounds.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										124
									
								
								src/include/partitioning/partbounds.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,124 @@ | ||||
| /*------------------------------------------------------------------------- | ||||
|  * | ||||
|  * partbounds.h | ||||
|  * | ||||
|  * Copyright (c) 2007-2018, PostgreSQL Global Development Group | ||||
|  * | ||||
|  * src/include/partitioning/partbounds.h | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| #ifndef PARTBOUNDS_H | ||||
| #define PARTBOUNDS_H | ||||
|  | ||||
| #include "catalog/partition.h" | ||||
|  | ||||
|  | ||||
| /* | ||||
|  * PartitionBoundInfoData encapsulates a set of partition bounds. It is | ||||
|  * usually associated with partitioned tables as part of its partition | ||||
|  * descriptor, but may also be used to represent a virtual partitioned | ||||
|  * table such as a partitioned joinrel within the planner. | ||||
|  * | ||||
|  * A list partition datum that is known to be NULL is never put into the | ||||
|  * datums array. Instead, it is tracked using the null_index field. | ||||
|  * | ||||
|  * In the case of range partitioning, ndatums will typically be far less than | ||||
|  * 2 * nparts, because a partition's upper bound and the next partition's lower | ||||
|  * bound are the same in most common cases, and we only store one of them (the | ||||
|  * upper bound).  In case of hash partitioning, ndatums will be same as the | ||||
|  * number of partitions. | ||||
|  * | ||||
|  * For range and list partitioned tables, datums is an array of datum-tuples | ||||
|  * with key->partnatts datums each.  For hash partitioned tables, it is an array | ||||
|  * of datum-tuples with 2 datums, modulus and remainder, corresponding to a | ||||
|  * given partition. | ||||
|  * | ||||
|  * The datums in datums array are arranged in increasing order as defined by | ||||
|  * functions qsort_partition_rbound_cmp(), qsort_partition_list_value_cmp() and | ||||
|  * qsort_partition_hbound_cmp() for range, list and hash partitioned tables | ||||
|  * respectively. For range and list partitions this simply means that the | ||||
|  * datums in the datums array are arranged in increasing order as defined by | ||||
|  * the partition key's operator classes and collations. | ||||
|  * | ||||
|  * In the case of list partitioning, the indexes array stores one entry for | ||||
|  * every datum, which is the index of the partition that accepts a given datum. | ||||
|  * In case of range partitioning, it stores one entry per distinct range | ||||
|  * datum, which is the index of the partition for which a given datum | ||||
|  * is an upper bound.  In the case of hash partitioning, the number of the | ||||
|  * entries in the indexes array is same as the greatest modulus amongst all | ||||
|  * partitions.  For a given partition key datum-tuple, the index of the | ||||
|  * partition which would accept that datum-tuple would be given by the entry | ||||
|  * pointed by remainder produced when hash value of the datum-tuple is divided | ||||
|  * by the greatest modulus. | ||||
|  */ | ||||
|  | ||||
| typedef struct PartitionBoundInfoData | ||||
| { | ||||
| 	char		strategy;		/* hash, list or range? */ | ||||
| 	int			ndatums;		/* Length of the datums following array */ | ||||
| 	Datum	  **datums; | ||||
| 	PartitionRangeDatumKind **kind; /* The kind of each range bound datum; | ||||
| 									 * NULL for hash and list partitioned | ||||
| 									 * tables */ | ||||
| 	int		   *indexes;		/* Partition indexes */ | ||||
| 	int			null_index;		/* Index of the null-accepting partition; -1 | ||||
| 								 * if there isn't one */ | ||||
| 	int			default_index;	/* Index of the default partition; -1 if there | ||||
| 								 * isn't one */ | ||||
| } PartitionBoundInfoData; | ||||
|  | ||||
| #define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1) | ||||
| #define partition_bound_has_default(bi) ((bi)->default_index != -1) | ||||
|  | ||||
| /* | ||||
|  * When qsort'ing partition bounds after reading from the catalog, each bound | ||||
|  * is represented with one of the following structs. | ||||
|  */ | ||||
|  | ||||
| /* One bound of a hash partition */ | ||||
| typedef struct PartitionHashBound | ||||
| { | ||||
| 	int			modulus; | ||||
| 	int			remainder; | ||||
| 	int			index; | ||||
| } PartitionHashBound; | ||||
|  | ||||
| /* One value coming from some (index'th) list partition */ | ||||
| typedef struct PartitionListValue | ||||
| { | ||||
| 	int			index; | ||||
| 	Datum		value; | ||||
| } PartitionListValue; | ||||
|  | ||||
| /* One bound of a range partition */ | ||||
| typedef struct PartitionRangeBound | ||||
| { | ||||
| 	int			index; | ||||
| 	Datum	   *datums;			/* range bound datums */ | ||||
| 	PartitionRangeDatumKind *kind;	/* the kind of each datum */ | ||||
| 	bool		lower;			/* this is the lower (vs upper) bound */ | ||||
| } PartitionRangeBound; | ||||
|  | ||||
| extern int	get_hash_partition_greatest_modulus(PartitionBoundInfo b); | ||||
| extern int partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, | ||||
| 					   PartitionBoundInfo boundinfo, | ||||
| 					   Datum value, bool *is_equal); | ||||
| extern int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, | ||||
| 						Oid *partcollation, | ||||
| 						PartitionBoundInfo boundinfo, | ||||
| 						PartitionRangeBound *probe, bool *is_equal); | ||||
| extern int partition_range_datum_bsearch(FmgrInfo *partsupfunc, | ||||
| 							  Oid *partcollation, | ||||
| 							  PartitionBoundInfo boundinfo, | ||||
| 							  int nvalues, Datum *values, bool *is_equal); | ||||
| extern int partition_hash_bsearch(PartitionBoundInfo boundinfo, | ||||
| 					   int modulus, int remainder); | ||||
| extern uint64 compute_hash_value(int partnatts, FmgrInfo *partsupfunc, | ||||
| 				   Datum *values, bool *isnull); | ||||
| extern int32 partition_rbound_datum_cmp(FmgrInfo *partsupfunc, | ||||
| 						   Oid *partcollation, | ||||
| 						   Datum *rb_datums, PartitionRangeDatumKind *rb_kind, | ||||
| 						   Datum *tuple_datums, int n_tuple_datums); | ||||
|  | ||||
| #endif							/* PARTBOUNDS_H */ | ||||
							
								
								
									
										49
									
								
								src/include/partitioning/partprune.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										49
									
								
								src/include/partitioning/partprune.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,49 @@ | ||||
| /*------------------------------------------------------------------------- | ||||
|  * | ||||
|  * partprune.h | ||||
|  *	  prototypes for partprune.c | ||||
|  * | ||||
|  * | ||||
|  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * src/include/partitioning/partprune.h | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| #ifndef PARTPRUNE_H | ||||
| #define PARTPRUNE_H | ||||
|  | ||||
| #include "catalog/partition.h" | ||||
| #include "nodes/relation.h" | ||||
|  | ||||
| /* | ||||
|  * PartitionPruneContext | ||||
|  * | ||||
|  * Information about a partitioned table needed to perform partition pruning. | ||||
|  */ | ||||
| typedef struct PartitionPruneContext | ||||
| { | ||||
| 	/* Partition key information */ | ||||
| 	char		strategy; | ||||
| 	int			partnatts; | ||||
| 	Oid		   *partopfamily; | ||||
| 	Oid		   *partopcintype; | ||||
| 	Oid		   *partcollation; | ||||
| 	FmgrInfo   *partsupfunc; | ||||
|  | ||||
| 	/* Number of partitions */ | ||||
| 	int			nparts; | ||||
|  | ||||
| 	/* Partition boundary info */ | ||||
| 	PartitionBoundInfo boundinfo; | ||||
| } PartitionPruneContext; | ||||
|  | ||||
|  | ||||
| extern Relids prune_append_rel_partitions(RelOptInfo *rel); | ||||
| extern Bitmapset *get_matching_partitions(PartitionPruneContext *context, | ||||
| 						List *pruning_steps); | ||||
| extern List *gen_partprune_steps(RelOptInfo *rel, List *clauses, | ||||
| 					bool *contradictory); | ||||
|  | ||||
| #endif							/* PARTPRUNE_H */ | ||||
		Reference in New Issue
	
	Block a user