mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-29 22:49:41 +03:00 
			
		
		
		
	Allow left join removals and unique joins on partitioned tables
This allows left join removals and unique joins to work with partitioned tables. The planner just lacked sufficient proofs that a given join would not cause any row duplication. Unique indexes currently serve as that proof, so have get_relation_info() populate the indexlist for partitioned tables too. Author: Arne Roland Reviewed-by: Alvaro Herrera, Zhihong Yu, Amit Langote, David Rowley Discussion: https://postgr.es/m/c3b2408b7a39433b8230bbcd02e9f302@index.de
This commit is contained in:
		| @@ -109,7 +109,9 @@ static void set_baserel_partition_constraint(Relation relation, | ||||
|  * If inhparent is true, all we need to do is set up the attr arrays: | ||||
|  * the RelOptInfo actually represents the appendrel formed by an inheritance | ||||
|  * tree, and so the parent rel's physical size and index information isn't | ||||
|  * important for it. | ||||
|  * important for it, however, for partitioned tables, we do populate the | ||||
|  * indexlist as the planner uses unique indexes as unique proofs for certain | ||||
|  * optimizations. | ||||
|  */ | ||||
| void | ||||
| get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, | ||||
| @@ -175,10 +177,14 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, | ||||
|  | ||||
| 	/* | ||||
| 	 * Make list of indexes.  Ignore indexes on system catalogs if told to. | ||||
| 	 * Don't bother with indexes for an inheritance parent, either. | ||||
| 	 * Don't bother with indexes from traditional inheritance parents.  For | ||||
| 	 * partitioned tables, we need a list of at least unique indexes as these | ||||
| 	 * serve as unique proofs for certain planner optimizations.  However, | ||||
| 	 * let's not discriminate here and just record all partitioned indexes | ||||
| 	 * whether they're unique indexes or not. | ||||
| 	 */ | ||||
| 	if (inhparent || | ||||
| 		(IgnoreSystemIndexes && IsSystemRelation(relation))) | ||||
| 	if ((inhparent && relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) | ||||
| 		|| (IgnoreSystemIndexes && IsSystemRelation(relation))) | ||||
| 		hasindex = false; | ||||
| 	else | ||||
| 		hasindex = relation->rd_rel->relhasindex; | ||||
| @@ -231,16 +237,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, | ||||
| 				continue; | ||||
| 			} | ||||
|  | ||||
| 			/* | ||||
| 			 * Ignore partitioned indexes, since they are not usable for | ||||
| 			 * queries. | ||||
| 			 */ | ||||
| 			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) | ||||
| 			{ | ||||
| 				index_close(indexRelation, NoLock); | ||||
| 				continue; | ||||
| 			} | ||||
|  | ||||
| 			/* | ||||
| 			 * If the index is valid, but cannot yet be used, ignore it; but | ||||
| 			 * mark the plan we are generating as transient. See | ||||
| @@ -285,105 +281,129 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, | ||||
|  | ||||
| 			info->relam = indexRelation->rd_rel->relam; | ||||
|  | ||||
| 			/* We copy just the fields we need, not all of rd_indam */ | ||||
| 			amroutine = indexRelation->rd_indam; | ||||
| 			info->amcanorderbyop = amroutine->amcanorderbyop; | ||||
| 			info->amoptionalkey = amroutine->amoptionalkey; | ||||
| 			info->amsearcharray = amroutine->amsearcharray; | ||||
| 			info->amsearchnulls = amroutine->amsearchnulls; | ||||
| 			info->amcanparallel = amroutine->amcanparallel; | ||||
| 			info->amhasgettuple = (amroutine->amgettuple != NULL); | ||||
| 			info->amhasgetbitmap = amroutine->amgetbitmap != NULL && | ||||
| 				relation->rd_tableam->scan_bitmap_next_block != NULL; | ||||
| 			info->amcanmarkpos = (amroutine->ammarkpos != NULL && | ||||
| 								  amroutine->amrestrpos != NULL); | ||||
| 			info->amcostestimate = amroutine->amcostestimate; | ||||
| 			Assert(info->amcostestimate != NULL); | ||||
|  | ||||
| 			/* Fetch index opclass options */ | ||||
| 			info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true); | ||||
|  | ||||
| 			/* | ||||
| 			 * Fetch the ordering information for the index, if any. | ||||
| 			 * We don't have an AM for partitioned indexes, so we'll just | ||||
| 			 * NULLify the AM related fields for those. | ||||
| 			 */ | ||||
| 			if (info->relam == BTREE_AM_OID) | ||||
| 			if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) | ||||
| 			{ | ||||
| 				/* We copy just the fields we need, not all of rd_indam */ | ||||
| 				amroutine = indexRelation->rd_indam; | ||||
| 				info->amcanorderbyop = amroutine->amcanorderbyop; | ||||
| 				info->amoptionalkey = amroutine->amoptionalkey; | ||||
| 				info->amsearcharray = amroutine->amsearcharray; | ||||
| 				info->amsearchnulls = amroutine->amsearchnulls; | ||||
| 				info->amcanparallel = amroutine->amcanparallel; | ||||
| 				info->amhasgettuple = (amroutine->amgettuple != NULL); | ||||
| 				info->amhasgetbitmap = amroutine->amgetbitmap != NULL && | ||||
| 					relation->rd_tableam->scan_bitmap_next_block != NULL; | ||||
| 				info->amcanmarkpos = (amroutine->ammarkpos != NULL && | ||||
| 									  amroutine->amrestrpos != NULL); | ||||
| 				info->amcostestimate = amroutine->amcostestimate; | ||||
| 				Assert(info->amcostestimate != NULL); | ||||
|  | ||||
| 				/* Fetch index opclass options */ | ||||
| 				info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true); | ||||
|  | ||||
| 				/* | ||||
| 				 * If it's a btree index, we can use its opfamily OIDs | ||||
| 				 * directly as the sort ordering opfamily OIDs. | ||||
| 				 * Fetch the ordering information for the index, if any. | ||||
| 				 */ | ||||
| 				Assert(amroutine->amcanorder); | ||||
|  | ||||
| 				info->sortopfamily = info->opfamily; | ||||
| 				info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns); | ||||
| 				info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns); | ||||
|  | ||||
| 				for (i = 0; i < nkeycolumns; i++) | ||||
| 				if (info->relam == BTREE_AM_OID) | ||||
| 				{ | ||||
| 					int16		opt = indexRelation->rd_indoption[i]; | ||||
| 					/* | ||||
| 					 * If it's a btree index, we can use its opfamily OIDs | ||||
| 					 * directly as the sort ordering opfamily OIDs. | ||||
| 					 */ | ||||
| 					Assert(amroutine->amcanorder); | ||||
|  | ||||
| 					info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; | ||||
| 					info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; | ||||
| 					info->sortopfamily = info->opfamily; | ||||
| 					info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns); | ||||
| 					info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns); | ||||
|  | ||||
| 					for (i = 0; i < nkeycolumns; i++) | ||||
| 					{ | ||||
| 						int16		opt = indexRelation->rd_indoption[i]; | ||||
|  | ||||
| 						info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; | ||||
| 						info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; | ||||
| 					} | ||||
| 				} | ||||
| 			} | ||||
| 			else if (amroutine->amcanorder) | ||||
| 			{ | ||||
| 				/* | ||||
| 				 * Otherwise, identify the corresponding btree opfamilies by | ||||
| 				 * trying to map this index's "<" operators into btree.  Since | ||||
| 				 * "<" uniquely defines the behavior of a sort order, this is | ||||
| 				 * a sufficient test. | ||||
| 				 * | ||||
| 				 * XXX This method is rather slow and also requires the | ||||
| 				 * undesirable assumption that the other index AM numbers its | ||||
| 				 * strategies the same as btree.  It'd be better to have a way | ||||
| 				 * to explicitly declare the corresponding btree opfamily for | ||||
| 				 * each opfamily of the other index type.  But given the lack | ||||
| 				 * of current or foreseeable amcanorder index types, it's not | ||||
| 				 * worth expending more effort on now. | ||||
| 				 */ | ||||
| 				info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns); | ||||
| 				info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns); | ||||
| 				info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns); | ||||
|  | ||||
| 				for (i = 0; i < nkeycolumns; i++) | ||||
| 				else if (amroutine->amcanorder) | ||||
| 				{ | ||||
| 					int16		opt = indexRelation->rd_indoption[i]; | ||||
| 					Oid			ltopr; | ||||
| 					Oid			btopfamily; | ||||
| 					Oid			btopcintype; | ||||
| 					int16		btstrategy; | ||||
| 					/* | ||||
| 					 * Otherwise, identify the corresponding btree opfamilies | ||||
| 					 * by trying to map this index's "<" operators into btree. | ||||
| 					 * Since "<" uniquely defines the behavior of a sort | ||||
| 					 * order, this is a sufficient test. | ||||
| 					 * | ||||
| 					 * XXX This method is rather slow and also requires the | ||||
| 					 * undesirable assumption that the other index AM numbers | ||||
| 					 * its strategies the same as btree.  It'd be better to | ||||
| 					 * have a way to explicitly declare the corresponding | ||||
| 					 * btree opfamily for each opfamily of the other index | ||||
| 					 * type.  But given the lack of current or foreseeable | ||||
| 					 * amcanorder index types, it's not worth expending more | ||||
| 					 * effort on now. | ||||
| 					 */ | ||||
| 					info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns); | ||||
| 					info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns); | ||||
| 					info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns); | ||||
|  | ||||
| 					info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; | ||||
| 					info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; | ||||
| 					for (i = 0; i < nkeycolumns; i++) | ||||
| 					{ | ||||
| 						int16		opt = indexRelation->rd_indoption[i]; | ||||
| 						Oid			ltopr; | ||||
| 						Oid			btopfamily; | ||||
| 						Oid			btopcintype; | ||||
| 						int16		btstrategy; | ||||
|  | ||||
| 					ltopr = get_opfamily_member(info->opfamily[i], | ||||
| 												info->opcintype[i], | ||||
| 												info->opcintype[i], | ||||
| 												BTLessStrategyNumber); | ||||
| 					if (OidIsValid(ltopr) && | ||||
| 						get_ordering_op_properties(ltopr, | ||||
| 												   &btopfamily, | ||||
| 												   &btopcintype, | ||||
| 												   &btstrategy) && | ||||
| 						btopcintype == info->opcintype[i] && | ||||
| 						btstrategy == BTLessStrategyNumber) | ||||
| 					{ | ||||
| 						/* Successful mapping */ | ||||
| 						info->sortopfamily[i] = btopfamily; | ||||
| 					} | ||||
| 					else | ||||
| 					{ | ||||
| 						/* Fail ... quietly treat index as unordered */ | ||||
| 						info->sortopfamily = NULL; | ||||
| 						info->reverse_sort = NULL; | ||||
| 						info->nulls_first = NULL; | ||||
| 						break; | ||||
| 						info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; | ||||
| 						info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; | ||||
|  | ||||
| 						ltopr = get_opfamily_member(info->opfamily[i], | ||||
| 													info->opcintype[i], | ||||
| 													info->opcintype[i], | ||||
| 													BTLessStrategyNumber); | ||||
| 						if (OidIsValid(ltopr) && | ||||
| 							get_ordering_op_properties(ltopr, | ||||
| 													   &btopfamily, | ||||
| 													   &btopcintype, | ||||
| 													   &btstrategy) && | ||||
| 							btopcintype == info->opcintype[i] && | ||||
| 							btstrategy == BTLessStrategyNumber) | ||||
| 						{ | ||||
| 							/* Successful mapping */ | ||||
| 							info->sortopfamily[i] = btopfamily; | ||||
| 						} | ||||
| 						else | ||||
| 						{ | ||||
| 							/* Fail ... quietly treat index as unordered */ | ||||
| 							info->sortopfamily = NULL; | ||||
| 							info->reverse_sort = NULL; | ||||
| 							info->nulls_first = NULL; | ||||
| 							break; | ||||
| 						} | ||||
| 					} | ||||
| 				} | ||||
| 				else | ||||
| 				{ | ||||
| 					info->sortopfamily = NULL; | ||||
| 					info->reverse_sort = NULL; | ||||
| 					info->nulls_first = NULL; | ||||
| 				} | ||||
| 			} | ||||
| 			else | ||||
| 			{ | ||||
| 				info->amcanorderbyop = false; | ||||
| 				info->amoptionalkey = false; | ||||
| 				info->amsearcharray = false; | ||||
| 				info->amsearchnulls = false; | ||||
| 				info->amcanparallel = false; | ||||
| 				info->amhasgettuple = false; | ||||
| 				info->amhasgetbitmap = false; | ||||
| 				info->amcanmarkpos = false; | ||||
| 				info->amcostestimate = NULL; | ||||
|  | ||||
| 				info->sortopfamily = NULL; | ||||
| 				info->reverse_sort = NULL; | ||||
| 				info->nulls_first = NULL; | ||||
| @@ -416,31 +436,45 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, | ||||
| 			 * the number-of-tuples estimate to equal the parent table; if it | ||||
| 			 * is partial then we have to use the same methods as we would for | ||||
| 			 * a table, except we can be sure that the index is not larger | ||||
| 			 * than the table. | ||||
| 			 * than the table.  We must ignore partitioned indexes here as as | ||||
| 			 * there are not physical indexes. | ||||
| 			 */ | ||||
| 			if (info->indpred == NIL) | ||||
| 			if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) | ||||
| 			{ | ||||
| 				info->pages = RelationGetNumberOfBlocks(indexRelation); | ||||
| 				info->tuples = rel->tuples; | ||||
| 			} | ||||
| 			else | ||||
| 			{ | ||||
| 				double		allvisfrac; /* dummy */ | ||||
|  | ||||
| 				estimate_rel_size(indexRelation, NULL, | ||||
| 								  &info->pages, &info->tuples, &allvisfrac); | ||||
| 				if (info->tuples > rel->tuples) | ||||
| 				if (info->indpred == NIL) | ||||
| 				{ | ||||
| 					info->pages = RelationGetNumberOfBlocks(indexRelation); | ||||
| 					info->tuples = rel->tuples; | ||||
| 			} | ||||
| 				} | ||||
| 				else | ||||
| 				{ | ||||
| 					double		allvisfrac; /* dummy */ | ||||
|  | ||||
| 			if (info->relam == BTREE_AM_OID) | ||||
| 			{ | ||||
| 				/* For btrees, get tree height while we have the index open */ | ||||
| 				info->tree_height = _bt_getrootheight(indexRelation); | ||||
| 					estimate_rel_size(indexRelation, NULL, | ||||
| 									  &info->pages, &info->tuples, &allvisfrac); | ||||
| 					if (info->tuples > rel->tuples) | ||||
| 						info->tuples = rel->tuples; | ||||
| 				} | ||||
|  | ||||
| 				if (info->relam == BTREE_AM_OID) | ||||
| 				{ | ||||
| 					/* | ||||
| 					 * For btrees, get tree height while we have the index | ||||
| 					 * open | ||||
| 					 */ | ||||
| 					info->tree_height = _bt_getrootheight(indexRelation); | ||||
| 				} | ||||
| 				else | ||||
| 				{ | ||||
| 					/* For other index types, just set it to "unknown" for now */ | ||||
| 					info->tree_height = -1; | ||||
| 				} | ||||
| 			} | ||||
| 			else | ||||
| 			{ | ||||
| 				/* For other index types, just set it to "unknown" for now */ | ||||
| 				/* Zero these out for partitioned indexes */ | ||||
| 				info->pages = 0; | ||||
| 				info->tuples = 0.0; | ||||
| 				info->tree_height = -1; | ||||
| 			} | ||||
|  | ||||
|   | ||||
| @@ -5994,6 +5994,10 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata, | ||||
| 	rte = root->simple_rte_array[rel->relid]; | ||||
| 	Assert(rte->rtekind == RTE_RELATION); | ||||
|  | ||||
| 	/* ignore partitioned tables.  Any indexes here are not real indexes */ | ||||
| 	if (rte->relkind == RELKIND_PARTITIONED_TABLE) | ||||
| 		return false; | ||||
|  | ||||
| 	/* Search through the indexes to see if any match our problem */ | ||||
| 	foreach(lc, rel->indexlist) | ||||
| 	{ | ||||
|   | ||||
| @@ -653,7 +653,7 @@ typedef struct PartitionSchemeData *PartitionScheme; | ||||
|  *		lateral_referencers - relids of rels that reference this one laterally | ||||
|  *				(includes both direct and indirect lateral references) | ||||
|  *		indexlist - list of IndexOptInfo nodes for relation's indexes | ||||
|  *					(always NIL if it's not a table) | ||||
|  *					(always NIL if it's not a table or partitioned table) | ||||
|  *		pages - number of disk pages in relation (zero if not a table) | ||||
|  *		tuples - number of tuples in relation (not considering restrictions) | ||||
|  *		allvisfrac - fraction of disk pages that are marked all-visible | ||||
| @@ -1097,11 +1097,11 @@ struct IndexOptInfo | ||||
| 	Oid		   *opfamily pg_node_attr(array_size(nkeycolumns)); | ||||
| 	/* OIDs of opclass declared input data types */ | ||||
| 	Oid		   *opcintype pg_node_attr(array_size(nkeycolumns)); | ||||
| 	/* OIDs of btree opfamilies, if orderable */ | ||||
| 	/* OIDs of btree opfamilies, if orderable.  NULL if partitioned index */ | ||||
| 	Oid		   *sortopfamily pg_node_attr(array_size(nkeycolumns)); | ||||
| 	/* is sort order descending? */ | ||||
| 	/* is sort order descending? or NULL if partitioned index */ | ||||
| 	bool	   *reverse_sort pg_node_attr(array_size(nkeycolumns)); | ||||
| 	/* do NULLs come first in the sort order? */ | ||||
| 	/* do NULLs come first in the sort order? or NULL if partitioned index */ | ||||
| 	bool	   *nulls_first pg_node_attr(array_size(nkeycolumns)); | ||||
| 	/* opclass-specific options for columns */ | ||||
| 	bytea	  **opclassoptions pg_node_attr(read_write_ignore); | ||||
| @@ -1139,7 +1139,7 @@ struct IndexOptInfo | ||||
|  | ||||
| 	/* | ||||
| 	 * Remaining fields are copied from the index AM's API struct | ||||
| 	 * (IndexAmRoutine). | ||||
| 	 * (IndexAmRoutine).  These fields are not set for partitioned indexes. | ||||
| 	 */ | ||||
| 	bool		amcanorderbyop; | ||||
| 	bool		amoptionalkey; | ||||
|   | ||||
| @@ -4860,6 +4860,16 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q, | ||||
|          Filter: (a.id = i) | ||||
| (4 rows) | ||||
|  | ||||
| CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id); | ||||
| CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10); | ||||
| -- test join removals on a partitioned table | ||||
| explain (costs off) | ||||
| select a.* from a left join parted_b pb on a.b_id = pb.id; | ||||
|   QUERY PLAN    | ||||
| --------------- | ||||
|  Seq Scan on a | ||||
| (1 row) | ||||
|  | ||||
| rollback; | ||||
| create temp table parent (k int primary key, pd int); | ||||
| create temp table child (k int unique, cd int); | ||||
|   | ||||
| @@ -4874,7 +4874,7 @@ ANALYZE fract_t; | ||||
| SET max_parallel_workers_per_gather = 0; | ||||
| SET enable_partitionwise_join = on; | ||||
| EXPLAIN (COSTS OFF) | ||||
| SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10; | ||||
| SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10; | ||||
|                               QUERY PLAN                                | ||||
| ----------------------------------------------------------------------- | ||||
|  Limit | ||||
| @@ -4891,7 +4891,7 @@ SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10; | ||||
| (11 rows) | ||||
|  | ||||
| EXPLAIN (COSTS OFF) | ||||
| SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10; | ||||
| SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10; | ||||
|                                    QUERY PLAN                                    | ||||
| -------------------------------------------------------------------------------- | ||||
|  Limit | ||||
|   | ||||
| @@ -1709,6 +1709,13 @@ explain (costs off) | ||||
| select 1 from (select a.id FROM a left join b on a.b_id = b.id) q, | ||||
| 			  lateral generate_series(1, q.id) gs(i) where q.id = gs.i; | ||||
|  | ||||
| CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id); | ||||
| CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10); | ||||
|  | ||||
| -- test join removals on a partitioned table | ||||
| explain (costs off) | ||||
| select a.* from a left join parted_b pb on a.b_id = pb.id; | ||||
|  | ||||
| rollback; | ||||
|  | ||||
| create temp table parent (k int primary key, pd int); | ||||
|   | ||||
| @@ -1157,10 +1157,10 @@ SET max_parallel_workers_per_gather = 0; | ||||
| SET enable_partitionwise_join = on; | ||||
|  | ||||
| EXPLAIN (COSTS OFF) | ||||
| SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10; | ||||
| SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10; | ||||
|  | ||||
| EXPLAIN (COSTS OFF) | ||||
| SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10; | ||||
| SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10; | ||||
|  | ||||
| -- cleanup | ||||
| DROP TABLE fract_t; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user