1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-02 09:02:37 +03:00

Allow partitionwise joins in more cases.

Previously, the partitionwise join technique only allowed partitionwise
join when input partitioned tables had exactly the same partition
bounds.  This commit extends the technique to some cases when the tables
have different partition bounds, by using an advanced partition-matching
algorithm introduced by this commit.  For both the input partitioned
tables, the algorithm checks whether every partition of one input
partitioned table only matches one partition of the other input
partitioned table at most, and vice versa.  In such a case the join
between the tables can be broken down into joins between the matching
partitions, so the algorithm produces the pairs of the matching
partitions, plus the partition bounds for the join relation, to allow
partitionwise join for computing the join.  Currently, the algorithm
works for list-partitioned and range-partitioned tables, but not
hash-partitioned tables.  See comments in partition_bounds_merge().

Ashutosh Bapat and Etsuro Fujita, most of regression tests by Rajkumar
Raghuwanshi, some of the tests by Mark Dilger and Amul Sul, reviewed by
Dmitry Dolgov and Amul Sul, with additional review at various points by
Ashutosh Bapat, Mark Dilger, Robert Haas, Antonin Houska, Amit Langote,
Justin Pryzby, and Tomas Vondra

Discussion: https://postgr.es/m/CAFjFpRdjQvaUEV5DJX3TW6pU5eq54NCkadtxHX2JiJG_GvbrCA@mail.gmail.com
This commit is contained in:
Etsuro Fujita
2020-04-08 10:25:00 +09:00
parent 41a194f491
commit c8434d64ce
11 changed files with 5390 additions and 81 deletions

View File

@ -2309,6 +2309,8 @@ _outRelOptInfo(StringInfo str, const RelOptInfo *node)
WRITE_BOOL_FIELD(has_eclass_joins);
WRITE_BOOL_FIELD(consider_partitionwise_join);
WRITE_BITMAPSET_FIELD(top_parent_relids);
WRITE_BOOL_FIELD(partbounds_merged);
WRITE_BITMAPSET_FIELD(all_partrels);
WRITE_NODE_FIELD(partitioned_child_rels);
}

View File

@ -1106,6 +1106,33 @@ into joins between their partitions is called partitionwise join. We will use
term "partitioned relation" for either a partitioned table or a join between
compatibly partitioned tables.
Even if the joining relations don't have exactly the same partition bounds,
partitionwise join can still be applied by using an advanced
partition-matching algorithm. For both the joining relations, the algorithm
checks wether every partition of one joining relation only matches one
partition of the other joining relation at most. In such a case the join
between the joining relations can be broken down into joins between the
matching partitions. The join relation can then be considerd partitioned.
The algorithm produces the pairs of the matching partitions, plus the
partition bounds for the join relation, to allow partitionwise join for
computing the join. The algorithm is implemented in partition_bounds_merge().
For an N-way join relation considered partitioned this way, not every pair of
joining relations can use partitionwise join. For example:
(A leftjoin B on (Pab)) innerjoin C on (Pac)
where A, B, and C are partitioned tables, and A has an extra partition
compared to B and C. When considering partitionwise join for the join {A B},
the extra partition of A doesn't have a matching partition on the nullable
side, which is the case that the current implementation of partitionwise join
can't handle. So {A B} is not considered partitioned, and the pair of {A B}
and C considered for the 3-way join can't use partitionwise join. On the
other hand, the pair of {A C} and B can use partitionwise join because {A C}
is considered partitioned by eliminating the extra partition (see identity 1
on outer join reordering). Whether an N-way join can use partitionwise join
is determined based on the first pair of joining relations that are both
partitioned and can use partitionwise join.
The partitioning properties of a partitioned relation are stored in its
RelOptInfo. The information about data types of partition keys are stored in
PartitionSchemeData structure. The planner maintains a list of canonical

View File

@ -45,6 +45,13 @@ static void try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1,
static SpecialJoinInfo *build_child_join_sjinfo(PlannerInfo *root,
SpecialJoinInfo *parent_sjinfo,
Relids left_relids, Relids right_relids);
static void compute_partition_bounds(PlannerInfo *root, RelOptInfo *rel1,
RelOptInfo *rel2, RelOptInfo *joinrel,
SpecialJoinInfo *parent_sjinfo,
List **parts1, List **parts2);
static void get_matching_part_pairs(PlannerInfo *root, RelOptInfo *joinrel,
RelOptInfo *rel1, RelOptInfo *rel2,
List **parts1, List **parts2);
/*
@ -1354,25 +1361,29 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2,
{
bool rel1_is_simple = IS_SIMPLE_REL(rel1);
bool rel2_is_simple = IS_SIMPLE_REL(rel2);
int nparts;
List *parts1 = NIL;
List *parts2 = NIL;
ListCell *lcr1 = NULL;
ListCell *lcr2 = NULL;
int cnt_parts;
/* Guard against stack overflow due to overly deep partition hierarchy. */
check_stack_depth();
/* Nothing to do, if the join relation is not partitioned. */
if (!IS_PARTITIONED_REL(joinrel))
if (joinrel->part_scheme == NULL || joinrel->nparts == 0)
return;
/* The join relation should have consider_partitionwise_join set. */
Assert(joinrel->consider_partitionwise_join);
/*
* Since this join relation is partitioned, all the base relations
* participating in this join must be partitioned and so are all the
* intermediate join relations.
* We can not perform partitionwise join if either of the joining relations
* is not partitioned.
*/
Assert(IS_PARTITIONED_REL(rel1) && IS_PARTITIONED_REL(rel2));
if (!IS_PARTITIONED_REL(rel1) || !IS_PARTITIONED_REL(rel2))
return;
Assert(REL_HAS_ALL_PART_PROPS(rel1) && REL_HAS_ALL_PART_PROPS(rel2));
/* The joining relations should have consider_partitionwise_join set. */
@ -1386,35 +1397,28 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2,
Assert(joinrel->part_scheme == rel1->part_scheme &&
joinrel->part_scheme == rel2->part_scheme);
/*
* Since we allow partitionwise join only when the partition bounds of the
* joining relations exactly match, the partition bounds of the join
* should match those of the joining relations.
*/
Assert(partition_bounds_equal(joinrel->part_scheme->partnatts,
joinrel->part_scheme->parttyplen,
joinrel->part_scheme->parttypbyval,
joinrel->boundinfo, rel1->boundinfo));
Assert(partition_bounds_equal(joinrel->part_scheme->partnatts,
joinrel->part_scheme->parttyplen,
joinrel->part_scheme->parttypbyval,
joinrel->boundinfo, rel2->boundinfo));
Assert(!(joinrel->partbounds_merged && (joinrel->nparts <= 0)));
nparts = joinrel->nparts;
compute_partition_bounds(root, rel1, rel2, joinrel, parent_sjinfo,
&parts1, &parts2);
if (joinrel->partbounds_merged)
{
lcr1 = list_head(parts1);
lcr2 = list_head(parts2);
}
/*
* Create child-join relations for this partitioned join, if those don't
* exist. Add paths to child-joins for a pair of child relations
* corresponding to the given pair of parent relations.
*/
for (cnt_parts = 0; cnt_parts < nparts; cnt_parts++)
for (cnt_parts = 0; cnt_parts < joinrel->nparts; cnt_parts++)
{
RelOptInfo *child_rel1 = rel1->part_rels[cnt_parts];
RelOptInfo *child_rel2 = rel2->part_rels[cnt_parts];
bool rel1_empty = (child_rel1 == NULL ||
IS_DUMMY_REL(child_rel1));
bool rel2_empty = (child_rel2 == NULL ||
IS_DUMMY_REL(child_rel2));
RelOptInfo *child_rel1;
RelOptInfo *child_rel2;
bool rel1_empty;
bool rel2_empty;
SpecialJoinInfo *child_sjinfo;
List *child_restrictlist;
RelOptInfo *child_joinrel;
@ -1422,6 +1426,22 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2,
AppendRelInfo **appinfos;
int nappinfos;
if (joinrel->partbounds_merged)
{
child_rel1 = lfirst_node(RelOptInfo, lcr1);
child_rel2 = lfirst_node(RelOptInfo, lcr2);
lcr1 = lnext(parts1, lcr1);
lcr2 = lnext(parts2, lcr2);
}
else
{
child_rel1 = rel1->part_rels[cnt_parts];
child_rel2 = rel2->part_rels[cnt_parts];
}
rel1_empty = (child_rel1 == NULL || IS_DUMMY_REL(child_rel1));
rel2_empty = (child_rel2 == NULL || IS_DUMMY_REL(child_rel2));
/*
* Check for cases where we can prove that this segment of the join
* returns no rows, due to one or both inputs being empty (including
@ -1519,6 +1539,8 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2,
child_sjinfo,
child_sjinfo->jointype);
joinrel->part_rels[cnt_parts] = child_joinrel;
joinrel->all_partrels = bms_add_members(joinrel->all_partrels,
child_joinrel->relids);
}
Assert(bms_equal(child_joinrel->relids, child_joinrelids));
@ -1570,3 +1592,190 @@ build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo,
return sjinfo;
}
/*
* compute_partition_bounds
* Compute the partition bounds for a join rel from those for inputs
*/
static void
compute_partition_bounds(PlannerInfo *root, RelOptInfo *rel1,
RelOptInfo *rel2, RelOptInfo *joinrel,
SpecialJoinInfo *parent_sjinfo,
List **parts1, List **parts2)
{
/*
* If we don't have the partition bounds for the join rel yet, try to
* compute those along with pairs of partitions to be joined.
*/
if (joinrel->nparts == -1)
{
PartitionScheme part_scheme = joinrel->part_scheme;
PartitionBoundInfo boundinfo = NULL;
int nparts = 0;
Assert(joinrel->boundinfo == NULL);
Assert(joinrel->part_rels == NULL);
/*
* See if the partition bounds for inputs are exactly the same, in
* which case we don't need to work hard: the join rel have the same
* partition bounds as inputs, and the partitions with the same
* cardinal positions form the pairs.
*
* Note: even in cases where one or both inputs have merged bounds,
* it would be possible for both the bounds to be exactly the same, but
* it seems unlikely to be worth the cycles to check.
*/
if (!rel1->partbounds_merged &&
!rel2->partbounds_merged &&
rel1->nparts == rel2->nparts &&
partition_bounds_equal(part_scheme->partnatts,
part_scheme->parttyplen,
part_scheme->parttypbyval,
rel1->boundinfo, rel2->boundinfo))
{
boundinfo = rel1->boundinfo;
nparts = rel1->nparts;
}
else
{
/* Try merging the partition bounds for inputs. */
boundinfo = partition_bounds_merge(part_scheme->partnatts,
part_scheme->partsupfunc,
part_scheme->partcollation,
rel1, rel2,
parent_sjinfo->jointype,
parts1, parts2);
if (boundinfo == NULL)
{
joinrel->nparts = 0;
return;
}
nparts = list_length(*parts1);
joinrel->partbounds_merged = true;
}
Assert(nparts > 0);
joinrel->boundinfo = boundinfo;
joinrel->nparts = nparts;
joinrel->part_rels =
(RelOptInfo **) palloc0(sizeof(RelOptInfo *) * nparts);
}
else
{
Assert(joinrel->nparts > 0);
Assert(joinrel->boundinfo);
Assert(joinrel->part_rels);
/*
* If the join rel's partbounds_merged flag is true, it means inputs
* are not guaranteed to have the same partition bounds, therefore we
* can't assume that the partitions at the same cardinal positions form
* the pairs; let get_matching_part_pairs() generate the pairs.
* Otherwise, nothing to do since we can assume that.
*/
if (joinrel->partbounds_merged)
{
get_matching_part_pairs(root, joinrel, rel1, rel2,
parts1, parts2);
Assert(list_length(*parts1) == joinrel->nparts);
Assert(list_length(*parts2) == joinrel->nparts);
}
}
}
/*
* get_matching_part_pairs
* Generate pairs of partitions to be joined from inputs
*/
static void
get_matching_part_pairs(PlannerInfo *root, RelOptInfo *joinrel,
RelOptInfo *rel1, RelOptInfo *rel2,
List **parts1, List **parts2)
{
bool rel1_is_simple = IS_SIMPLE_REL(rel1);
bool rel2_is_simple = IS_SIMPLE_REL(rel2);
int cnt_parts;
*parts1 = NIL;
*parts2 = NIL;
for (cnt_parts = 0; cnt_parts < joinrel->nparts; cnt_parts++)
{
RelOptInfo *child_joinrel = joinrel->part_rels[cnt_parts];
RelOptInfo *child_rel1;
RelOptInfo *child_rel2;
Relids child_relids1;
Relids child_relids2;
/*
* If this segment of the join is empty, it means that this segment
* was ignored when previously creating child-join paths for it in
* try_partitionwise_join() as it would not contribute to the join
* result, due to one or both inputs being empty; add NULL to each of
* the given lists so that this segment will be ignored again in that
* function.
*/
if (!child_joinrel)
{
*parts1 = lappend(*parts1, NULL);
*parts2 = lappend(*parts2, NULL);
continue;
}
/*
* Get a relids set of partition(s) involved in this join segment that
* are from the rel1 side.
*/
child_relids1 = bms_intersect(child_joinrel->relids,
rel1->all_partrels);
Assert(bms_num_members(child_relids1) == bms_num_members(rel1->relids));
/*
* Get a child rel for rel1 with the relids. Note that we should have
* the child rel even if rel1 is a join rel, because in that case the
* partitions specified in the relids would have matching/overlapping
* boundaries, so the specified partitions should be considered as ones
* to be joined when planning partitionwise joins of rel1, meaning that
* the child rel would have been built by the time we get here.
*/
if (rel1_is_simple)
{
int varno = bms_singleton_member(child_relids1);
child_rel1 = find_base_rel(root, varno);
}
else
child_rel1 = find_join_rel(root, child_relids1);
Assert(child_rel1);
/*
* Get a relids set of partition(s) involved in this join segment that
* are from the rel2 side.
*/
child_relids2 = bms_intersect(child_joinrel->relids,
rel2->all_partrels);
Assert(bms_num_members(child_relids2) == bms_num_members(rel2->relids));
/*
* Get a child rel for rel2 with the relids. See above comments.
*/
if (rel2_is_simple)
{
int varno = bms_singleton_member(child_relids2);
child_rel2 = find_base_rel(root, varno);
}
else
child_rel2 = find_join_rel(root, child_relids2);
Assert(child_rel2);
/*
* The join of rel1 and rel2 is legal, so is the join of the child
* rels obtained above; add them to the given lists as a join pair
* producing this join segment.
*/
*parts1 = lappend(*parts1, child_rel1);
*parts2 = lappend(*parts2, child_rel2);
}
}

View File

@ -376,6 +376,8 @@ expand_partitioned_rtentry(PlannerInfo *root, RelOptInfo *relinfo,
/* Create the otherrel RelOptInfo too. */
childrelinfo = build_simple_rel(root, childRTindex, relinfo);
relinfo->part_rels[i] = childrelinfo;
relinfo->all_partrels = bms_add_members(relinfo->all_partrels,
childrelinfo->relids);
/* If this child is itself partitioned, recurse */
if (childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)

View File

@ -249,10 +249,12 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent)
rel->has_eclass_joins = false;
rel->consider_partitionwise_join = false; /* might get changed later */
rel->part_scheme = NULL;
rel->nparts = 0;
rel->nparts = -1;
rel->boundinfo = NULL;
rel->partbounds_merged = false;
rel->partition_qual = NIL;
rel->part_rels = NULL;
rel->all_partrels = NULL;
rel->partexprs = NULL;
rel->nullable_partexprs = NULL;
rel->partitioned_child_rels = NIL;
@ -662,10 +664,12 @@ build_join_rel(PlannerInfo *root,
joinrel->consider_partitionwise_join = false; /* might get changed later */
joinrel->top_parent_relids = NULL;
joinrel->part_scheme = NULL;
joinrel->nparts = 0;
joinrel->nparts = -1;
joinrel->boundinfo = NULL;
joinrel->partbounds_merged = false;
joinrel->partition_qual = NIL;
joinrel->part_rels = NULL;
joinrel->all_partrels = NULL;
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
joinrel->partitioned_child_rels = NIL;
@ -838,10 +842,12 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel,
joinrel->consider_partitionwise_join = false; /* might get changed later */
joinrel->top_parent_relids = NULL;
joinrel->part_scheme = NULL;
joinrel->nparts = 0;
joinrel->nparts = -1;
joinrel->boundinfo = NULL;
joinrel->partbounds_merged = false;
joinrel->partition_qual = NIL;
joinrel->part_rels = NULL;
joinrel->all_partrels = NULL;
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
joinrel->partitioned_child_rels = NIL;
@ -1645,7 +1651,7 @@ build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel,
* of the way the query planner deduces implied equalities and reorders
* the joins. Please see optimizer/README for details.
*/
if (!IS_PARTITIONED_REL(outer_rel) || !IS_PARTITIONED_REL(inner_rel) ||
if (outer_rel->part_scheme == NULL || inner_rel->part_scheme == NULL ||
!outer_rel->consider_partitionwise_join ||
!inner_rel->consider_partitionwise_join ||
outer_rel->part_scheme != inner_rel->part_scheme ||
@ -1658,24 +1664,6 @@ build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel,
part_scheme = outer_rel->part_scheme;
Assert(REL_HAS_ALL_PART_PROPS(outer_rel) &&
REL_HAS_ALL_PART_PROPS(inner_rel));
/*
* For now, our partition matching algorithm can match partitions only
* when the partition bounds of the joining relations are exactly same.
* So, bail out otherwise.
*/
if (outer_rel->nparts != inner_rel->nparts ||
!partition_bounds_equal(part_scheme->partnatts,
part_scheme->parttyplen,
part_scheme->parttypbyval,
outer_rel->boundinfo, inner_rel->boundinfo))
{
Assert(!IS_PARTITIONED_REL(joinrel));
return;
}
/*
* This function will be called only once for each joinrel, hence it
* should not have partitioning fields filled yet.
@ -1685,18 +1673,15 @@ build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel,
!joinrel->boundinfo);
/*
* Join relation is partitioned using the same partitioning scheme as the
* joining relations and has same bounds.
* If the join relation is partitioned, it uses the same partitioning
* scheme as the joining relations.
*
* Note: we calculate the partition bounds, number of partitions, and
* child-join relations of the join relation in try_partitionwise_join().
*/
joinrel->part_scheme = part_scheme;
joinrel->boundinfo = outer_rel->boundinfo;
joinrel->nparts = outer_rel->nparts;
set_joinrel_partition_key_exprs(joinrel, outer_rel, inner_rel, jointype);
/* part_rels[] will be filled later, but allocate it now */
joinrel->part_rels =
(RelOptInfo **) palloc0(sizeof(RelOptInfo *) * joinrel->nparts);
/*
* Set the consider_partitionwise_join flag.
*/

File diff suppressed because it is too large Load Diff