mirror of
https://github.com/postgres/postgres.git
synced 2025-07-08 11:42:09 +03:00
Basic partition-wise join functionality.
Instead of joining two partitioned tables in their entirety we can, if it is an equi-join on the partition keys, join the matching partitions individually. This involves teaching the planner about "other join" rels, which are related to regular join rels in the same way that other member rels are related to baserels. This can use significantly more CPU time and memory than regular join planning, because there may now be a set of "other" rels not only for every base relation but also for every join relation. In most practical cases, this probably shouldn't be a problem, because (1) it's probably unusual to join many tables each with many partitions using the partition keys for all joins and (2) if you do that scenario then you probably have a big enough machine to handle the increased memory cost of planning and (3) the resulting plan is highly likely to be better, so what you spend in planning you'll make up on the execution side. All the same, for now, turn this feature off by default. Currently, we can only perform joins between two tables whose partitioning schemes are absolutely identical. It would be nice to cope with other scenarios, such as extra partitions on one side or the other with no match on the other side, but that will have to wait for a future patch. Ashutosh Bapat, reviewed and tested by Rajkumar Raghuwanshi, Amit Langote, Rafia Sabih, Thomas Munro, Dilip Kumar, Antonin Houska, Amit Khandekar, and by me. A few final adjustments by me. Discussion: http://postgr.es/m/CAFjFpRfQ8GrQvzp3jA2wnLqrHmaXna-urjm_UY9BqXj=EaDTSA@mail.gmail.com Discussion: http://postgr.es/m/CAFjFpRcitjfrULr5jfuKWRPsGUX0LQ0k8-yG0Qw2+1LBGNpMdw@mail.gmail.com
This commit is contained in:
@ -920,12 +920,79 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel,
|
||||
childrel = find_base_rel(root, childRTindex);
|
||||
Assert(childrel->reloptkind == RELOPT_OTHER_MEMBER_REL);
|
||||
|
||||
if (rel->part_scheme)
|
||||
{
|
||||
AttrNumber attno;
|
||||
|
||||
/*
|
||||
* We need attr_needed data for building targetlist of a join
|
||||
* relation representing join between matching partitions for
|
||||
* partition-wise join. A given attribute of a child will be
|
||||
* needed in the same highest joinrel where the corresponding
|
||||
* attribute of parent is needed. Hence it suffices to use the
|
||||
* same Relids set for parent and child.
|
||||
*/
|
||||
for (attno = rel->min_attr; attno <= rel->max_attr; attno++)
|
||||
{
|
||||
int index = attno - rel->min_attr;
|
||||
Relids attr_needed = rel->attr_needed[index];
|
||||
|
||||
/* System attributes do not need translation. */
|
||||
if (attno <= 0)
|
||||
{
|
||||
Assert(rel->min_attr == childrel->min_attr);
|
||||
childrel->attr_needed[index] = attr_needed;
|
||||
}
|
||||
else
|
||||
{
|
||||
Var *var = list_nth_node(Var,
|
||||
appinfo->translated_vars,
|
||||
attno - 1);
|
||||
int child_index;
|
||||
|
||||
child_index = var->varattno - childrel->min_attr;
|
||||
childrel->attr_needed[child_index] = attr_needed;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We have to copy the parent's targetlist and quals to the child,
|
||||
* with appropriate substitution of variables. However, only the
|
||||
* baserestrictinfo quals are needed before we can check for
|
||||
* constraint exclusion; so do that first and then check to see if we
|
||||
* can disregard this child.
|
||||
* Copy/Modify targetlist. Even if this child is deemed empty, we need
|
||||
* its targetlist in case it falls on nullable side in a child-join
|
||||
* because of partition-wise join.
|
||||
*
|
||||
* NB: the resulting childrel->reltarget->exprs may contain arbitrary
|
||||
* expressions, which otherwise would not occur in a rel's targetlist.
|
||||
* Code that might be looking at an appendrel child must cope with
|
||||
* such. (Normally, a rel's targetlist would only include Vars and
|
||||
* PlaceHolderVars.) XXX we do not bother to update the cost or width
|
||||
* fields of childrel->reltarget; not clear if that would be useful.
|
||||
*/
|
||||
childrel->reltarget->exprs = (List *)
|
||||
adjust_appendrel_attrs(root,
|
||||
(Node *) rel->reltarget->exprs,
|
||||
1, &appinfo);
|
||||
|
||||
/*
|
||||
* We have to make child entries in the EquivalenceClass data
|
||||
* structures as well. This is needed either if the parent
|
||||
* participates in some eclass joins (because we will want to consider
|
||||
* inner-indexscan joins on the individual children) or if the parent
|
||||
* has useful pathkeys (because we should try to build MergeAppend
|
||||
* paths that produce those sort orderings). Even if this child is
|
||||
* deemed dummy, it may fall on nullable side in a child-join, which
|
||||
* in turn may participate in a MergeAppend, where we will need the
|
||||
* EquivalenceClass data structures.
|
||||
*/
|
||||
if (rel->has_eclass_joins || has_useful_pathkeys(root, rel))
|
||||
add_child_rel_equivalences(root, appinfo, rel, childrel);
|
||||
childrel->has_eclass_joins = rel->has_eclass_joins;
|
||||
|
||||
/*
|
||||
* We have to copy the parent's quals to the child, with appropriate
|
||||
* substitution of variables. However, only the baserestrictinfo
|
||||
* quals are needed before we can check for constraint exclusion; so
|
||||
* do that first and then check to see if we can disregard this child.
|
||||
*
|
||||
* The child rel's targetlist might contain non-Var expressions, which
|
||||
* means that substitution into the quals could produce opportunities
|
||||
@ -1052,44 +1119,11 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel,
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* CE failed, so finish copying/modifying targetlist and join quals.
|
||||
*
|
||||
* NB: the resulting childrel->reltarget->exprs may contain arbitrary
|
||||
* expressions, which otherwise would not occur in a rel's targetlist.
|
||||
* Code that might be looking at an appendrel child must cope with
|
||||
* such. (Normally, a rel's targetlist would only include Vars and
|
||||
* PlaceHolderVars.) XXX we do not bother to update the cost or width
|
||||
* fields of childrel->reltarget; not clear if that would be useful.
|
||||
*/
|
||||
/* CE failed, so finish copying/modifying join quals. */
|
||||
childrel->joininfo = (List *)
|
||||
adjust_appendrel_attrs(root,
|
||||
(Node *) rel->joininfo,
|
||||
1, &appinfo);
|
||||
childrel->reltarget->exprs = (List *)
|
||||
adjust_appendrel_attrs(root,
|
||||
(Node *) rel->reltarget->exprs,
|
||||
1, &appinfo);
|
||||
|
||||
/*
|
||||
* We have to make child entries in the EquivalenceClass data
|
||||
* structures as well. This is needed either if the parent
|
||||
* participates in some eclass joins (because we will want to consider
|
||||
* inner-indexscan joins on the individual children) or if the parent
|
||||
* has useful pathkeys (because we should try to build MergeAppend
|
||||
* paths that produce those sort orderings).
|
||||
*/
|
||||
if (rel->has_eclass_joins || has_useful_pathkeys(root, rel))
|
||||
add_child_rel_equivalences(root, appinfo, rel, childrel);
|
||||
childrel->has_eclass_joins = rel->has_eclass_joins;
|
||||
|
||||
/*
|
||||
* Note: we could compute appropriate attr_needed data for the child's
|
||||
* variables, by transforming the parent's attr_needed through the
|
||||
* translated_vars mapping. However, currently there's no need
|
||||
* because attr_needed is only examined for base relations not
|
||||
* otherrels. So we just leave the child's attr_needed empty.
|
||||
*/
|
||||
|
||||
/*
|
||||
* If parallelism is allowable for this query in general, see whether
|
||||
@ -1262,14 +1296,14 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
|
||||
live_childrels = lappend(live_childrels, childrel);
|
||||
}
|
||||
|
||||
/* Add paths to the "append" relation. */
|
||||
/* Add paths to the append relation. */
|
||||
add_paths_to_append_rel(root, rel, live_childrels);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* add_paths_to_append_rel
|
||||
* Generate paths for given "append" relation given the set of non-dummy
|
||||
* Generate paths for the given append relation given the set of non-dummy
|
||||
* child rels.
|
||||
*
|
||||
* The function collects all parameterizations and orderings supported by the
|
||||
@ -1293,30 +1327,44 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
|
||||
RangeTblEntry *rte;
|
||||
bool build_partitioned_rels = false;
|
||||
|
||||
/*
|
||||
* A root partition will already have a PartitionedChildRelInfo, and a
|
||||
* non-root partitioned table doesn't need one, because its Append paths
|
||||
* will get flattened into the parent anyway. For a subquery RTE, no
|
||||
* PartitionedChildRelInfo exists; we collect all partitioned_rels
|
||||
* associated with any child. (This assumes that we don't need to look
|
||||
* through multiple levels of subquery RTEs; if we ever do, we could
|
||||
* create a PartitionedChildRelInfo with the accumulated list of
|
||||
* partitioned_rels which would then be found when populated our parent
|
||||
* rel with paths. For the present, that appears to be unnecessary.)
|
||||
*/
|
||||
rte = planner_rt_fetch(rel->relid, root);
|
||||
switch (rte->rtekind)
|
||||
if (IS_SIMPLE_REL(rel))
|
||||
{
|
||||
case RTE_RELATION:
|
||||
if (rte->relkind == RELKIND_PARTITIONED_TABLE)
|
||||
partitioned_rels =
|
||||
get_partitioned_child_rels(root, rel->relid);
|
||||
break;
|
||||
case RTE_SUBQUERY:
|
||||
build_partitioned_rels = true;
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected rtekind: %d", (int) rte->rtekind);
|
||||
/*
|
||||
* A root partition will already have a PartitionedChildRelInfo, and a
|
||||
* non-root partitioned table doesn't need one, because its Append
|
||||
* paths will get flattened into the parent anyway. For a subquery
|
||||
* RTE, no PartitionedChildRelInfo exists; we collect all
|
||||
* partitioned_rels associated with any child. (This assumes that we
|
||||
* don't need to look through multiple levels of subquery RTEs; if we
|
||||
* ever do, we could create a PartitionedChildRelInfo with the
|
||||
* accumulated list of partitioned_rels which would then be found when
|
||||
* populating our parent rel with paths. For the present, that appears
|
||||
* to be unnecessary.)
|
||||
*/
|
||||
rte = planner_rt_fetch(rel->relid, root);
|
||||
switch (rte->rtekind)
|
||||
{
|
||||
case RTE_RELATION:
|
||||
if (rte->relkind == RELKIND_PARTITIONED_TABLE)
|
||||
partitioned_rels =
|
||||
get_partitioned_child_rels(root, rel->relid);
|
||||
break;
|
||||
case RTE_SUBQUERY:
|
||||
build_partitioned_rels = true;
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpcted rtekind: %d", (int) rte->rtekind);
|
||||
}
|
||||
}
|
||||
else if (rel->reloptkind == RELOPT_JOINREL && rel->part_scheme)
|
||||
{
|
||||
/*
|
||||
* Associate PartitionedChildRelInfo of the root partitioned tables
|
||||
* being joined with the root partitioned join (indicated by
|
||||
* RELOPT_JOINREL).
|
||||
*/
|
||||
partitioned_rels = get_partitioned_child_rels_for_join(root,
|
||||
rel->relids);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2422,16 +2470,22 @@ standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels)
|
||||
join_search_one_level(root, lev);
|
||||
|
||||
/*
|
||||
* Run generate_gather_paths() for each just-processed joinrel. We
|
||||
* could not do this earlier because both regular and partial paths
|
||||
* can get added to a particular joinrel at multiple times within
|
||||
* join_search_one_level. After that, we're done creating paths for
|
||||
* the joinrel, so run set_cheapest().
|
||||
* Run generate_partition_wise_join_paths() and
|
||||
* generate_gather_paths() for each just-processed joinrel. We could
|
||||
* not do this earlier because both regular and partial paths can get
|
||||
* added to a particular joinrel at multiple times within
|
||||
* join_search_one_level.
|
||||
*
|
||||
* After that, we're done creating paths for the joinrel, so run
|
||||
* set_cheapest().
|
||||
*/
|
||||
foreach(lc, root->join_rel_level[lev])
|
||||
{
|
||||
rel = (RelOptInfo *) lfirst(lc);
|
||||
|
||||
/* Create paths for partition-wise joins. */
|
||||
generate_partition_wise_join_paths(root, rel);
|
||||
|
||||
/* Create GatherPaths for any useful partial paths for rel */
|
||||
generate_gather_paths(root, rel);
|
||||
|
||||
@ -3179,6 +3233,82 @@ compute_parallel_worker(RelOptInfo *rel, double heap_pages, double index_pages)
|
||||
return parallel_workers;
|
||||
}
|
||||
|
||||
/*
|
||||
* generate_partition_wise_join_paths
|
||||
* Create paths representing partition-wise join for given partitioned
|
||||
* join relation.
|
||||
*
|
||||
* This must not be called until after we are done adding paths for all
|
||||
* child-joins. Otherwise, add_path might delete a path to which some path
|
||||
* generated here has a reference.
|
||||
*/
|
||||
void
|
||||
generate_partition_wise_join_paths(PlannerInfo *root, RelOptInfo *rel)
|
||||
{
|
||||
List *live_children = NIL;
|
||||
int cnt_parts;
|
||||
int num_parts;
|
||||
RelOptInfo **part_rels;
|
||||
|
||||
/* Handle only join relations here. */
|
||||
if (!IS_JOIN_REL(rel))
|
||||
return;
|
||||
|
||||
/*
|
||||
* If we've already proven this join is empty, we needn't consider any
|
||||
* more paths for it.
|
||||
*/
|
||||
if (IS_DUMMY_REL(rel))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Nothing to do if the relation is not partitioned. An outer join
|
||||
* relation which had empty inner relation in every pair will have rest of
|
||||
* the partitioning properties set except the child-join RelOptInfos. See
|
||||
* try_partition_wise_join() for more explanation.
|
||||
*/
|
||||
if (rel->nparts <= 0 || rel->part_rels == NULL)
|
||||
return;
|
||||
|
||||
/* Guard against stack overflow due to overly deep partition hierarchy. */
|
||||
check_stack_depth();
|
||||
|
||||
num_parts = rel->nparts;
|
||||
part_rels = rel->part_rels;
|
||||
|
||||
/* Collect non-dummy child-joins. */
|
||||
for (cnt_parts = 0; cnt_parts < num_parts; cnt_parts++)
|
||||
{
|
||||
RelOptInfo *child_rel = part_rels[cnt_parts];
|
||||
|
||||
/* Add partition-wise join paths for partitioned child-joins. */
|
||||
generate_partition_wise_join_paths(root, child_rel);
|
||||
|
||||
/* Dummy children will not be scanned, so ingore those. */
|
||||
if (IS_DUMMY_REL(child_rel))
|
||||
continue;
|
||||
|
||||
set_cheapest(child_rel);
|
||||
|
||||
#ifdef OPTIMIZER_DEBUG
|
||||
debug_print_rel(root, rel);
|
||||
#endif
|
||||
|
||||
live_children = lappend(live_children, child_rel);
|
||||
}
|
||||
|
||||
/* If all child-joins are dummy, parent join is also dummy. */
|
||||
if (!live_children)
|
||||
{
|
||||
mark_dummy_rel(rel);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Build additional paths for this rel from child-join paths. */
|
||||
add_paths_to_append_rel(root, rel, live_children);
|
||||
list_free(live_children);
|
||||
}
|
||||
|
||||
|
||||
/*****************************************************************************
|
||||
* DEBUG SUPPORT
|
||||
|
@ -127,6 +127,7 @@ bool enable_material = true;
|
||||
bool enable_mergejoin = true;
|
||||
bool enable_hashjoin = true;
|
||||
bool enable_gathermerge = true;
|
||||
bool enable_partition_wise_join = false;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
@ -26,9 +26,19 @@
|
||||
/* Hook for plugins to get control in add_paths_to_joinrel() */
|
||||
set_join_pathlist_hook_type set_join_pathlist_hook = NULL;
|
||||
|
||||
#define PATH_PARAM_BY_REL(path, rel) \
|
||||
/*
 * Parameterization tests used when building join paths.
 *
 * Paths parameterized by the parent can be considered to be parameterized by
 * any of its children.
 */

/* Path has a param_info and requires rels from rel's topmost parent(s). */
#define PATH_PARAM_BY_PARENT(path, rel) \
	((path)->param_info && \
	 bms_overlap(PATH_REQ_OUTER(path), (rel)->top_parent_relids))

/* Path has a param_info and requires rels that belong to rel itself. */
#define PATH_PARAM_BY_REL_SELF(path, rel) \
	((path)->param_info && \
	 bms_overlap(PATH_REQ_OUTER(path), (rel)->relids))

/* Path is parameterized either by rel itself or by rel's topmost parent(s). */
#define PATH_PARAM_BY_REL(path, rel) \
	(PATH_PARAM_BY_REL_SELF(path, rel) || PATH_PARAM_BY_PARENT(path, rel))
|
||||
|
||||
static void try_partial_mergejoin_path(PlannerInfo *root,
|
||||
RelOptInfo *joinrel,
|
||||
Path *outer_path,
|
||||
@ -115,6 +125,19 @@ add_paths_to_joinrel(PlannerInfo *root,
|
||||
JoinPathExtraData extra;
|
||||
bool mergejoin_allowed = true;
|
||||
ListCell *lc;
|
||||
Relids joinrelids;
|
||||
|
||||
/*
|
||||
* PlannerInfo doesn't contain the SpecialJoinInfos created for joins
|
||||
* between child relations, even if there is a SpecialJoinInfo node for
|
||||
* the join between the topmost parents. So, while calculating Relids set
|
||||
* representing the restriction, consider relids of topmost parent of
|
||||
* partitions.
|
||||
*/
|
||||
if (joinrel->reloptkind == RELOPT_OTHER_JOINREL)
|
||||
joinrelids = joinrel->top_parent_relids;
|
||||
else
|
||||
joinrelids = joinrel->relids;
|
||||
|
||||
extra.restrictlist = restrictlist;
|
||||
extra.mergeclause_list = NIL;
|
||||
@ -211,16 +234,16 @@ add_paths_to_joinrel(PlannerInfo *root,
|
||||
* join has already been proven legal.) If the SJ is relevant, it
|
||||
* presents constraints for joining to anything not in its RHS.
|
||||
*/
|
||||
if (bms_overlap(joinrel->relids, sjinfo2->min_righthand) &&
|
||||
!bms_overlap(joinrel->relids, sjinfo2->min_lefthand))
|
||||
if (bms_overlap(joinrelids, sjinfo2->min_righthand) &&
|
||||
!bms_overlap(joinrelids, sjinfo2->min_lefthand))
|
||||
extra.param_source_rels = bms_join(extra.param_source_rels,
|
||||
bms_difference(root->all_baserels,
|
||||
sjinfo2->min_righthand));
|
||||
|
||||
/* full joins constrain both sides symmetrically */
|
||||
if (sjinfo2->jointype == JOIN_FULL &&
|
||||
bms_overlap(joinrel->relids, sjinfo2->min_lefthand) &&
|
||||
!bms_overlap(joinrel->relids, sjinfo2->min_righthand))
|
||||
bms_overlap(joinrelids, sjinfo2->min_lefthand) &&
|
||||
!bms_overlap(joinrelids, sjinfo2->min_righthand))
|
||||
extra.param_source_rels = bms_join(extra.param_source_rels,
|
||||
bms_difference(root->all_baserels,
|
||||
sjinfo2->min_lefthand));
|
||||
@ -347,11 +370,25 @@ try_nestloop_path(PlannerInfo *root,
|
||||
JoinCostWorkspace workspace;
|
||||
RelOptInfo *innerrel = inner_path->parent;
|
||||
RelOptInfo *outerrel = outer_path->parent;
|
||||
Relids innerrelids = innerrel->relids;
|
||||
Relids outerrelids = outerrel->relids;
|
||||
Relids innerrelids;
|
||||
Relids outerrelids;
|
||||
Relids inner_paramrels = PATH_REQ_OUTER(inner_path);
|
||||
Relids outer_paramrels = PATH_REQ_OUTER(outer_path);
|
||||
|
||||
/*
|
||||
* Paths are parameterized by top-level parents, so run parameterization
|
||||
* tests on the parent relids.
|
||||
*/
|
||||
if (innerrel->top_parent_relids)
|
||||
innerrelids = innerrel->top_parent_relids;
|
||||
else
|
||||
innerrelids = innerrel->relids;
|
||||
|
||||
if (outerrel->top_parent_relids)
|
||||
outerrelids = outerrel->top_parent_relids;
|
||||
else
|
||||
outerrelids = outerrel->relids;
|
||||
|
||||
/*
|
||||
* Check to see if proposed path is still parameterized, and reject if the
|
||||
* parameterization wouldn't be sensible --- unless allow_star_schema_join
|
||||
@ -387,6 +424,27 @@ try_nestloop_path(PlannerInfo *root,
|
||||
workspace.startup_cost, workspace.total_cost,
|
||||
pathkeys, required_outer))
|
||||
{
|
||||
/*
|
||||
* If the inner path is parameterized, it is parameterized by the
|
||||
* topmost parent of the outer rel, not the outer rel itself. Fix
|
||||
* that.
|
||||
*/
|
||||
if (PATH_PARAM_BY_PARENT(inner_path, outer_path->parent))
|
||||
{
|
||||
inner_path = reparameterize_path_by_child(root, inner_path,
|
||||
outer_path->parent);
|
||||
|
||||
/*
|
||||
* If we could not translate the path, we can't create nest loop
|
||||
* path.
|
||||
*/
|
||||
if (!inner_path)
|
||||
{
|
||||
bms_free(required_outer);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
add_path(joinrel, (Path *)
|
||||
create_nestloop_path(root,
|
||||
joinrel,
|
||||
@ -432,8 +490,20 @@ try_partial_nestloop_path(PlannerInfo *root,
|
||||
if (inner_path->param_info != NULL)
|
||||
{
|
||||
Relids inner_paramrels = inner_path->param_info->ppi_req_outer;
|
||||
RelOptInfo *outerrel = outer_path->parent;
|
||||
Relids outerrelids;
|
||||
|
||||
if (!bms_is_subset(inner_paramrels, outer_path->parent->relids))
|
||||
/*
|
||||
* The inner and outer paths are parameterized, if at all, by the top
|
||||
* level parents, not the child relations, so we must use those relids
|
||||
* for our parameterization tests.
|
||||
*/
|
||||
if (outerrel->top_parent_relids)
|
||||
outerrelids = outerrel->top_parent_relids;
|
||||
else
|
||||
outerrelids = outerrel->relids;
|
||||
|
||||
if (!bms_is_subset(inner_paramrels, outerrelids))
|
||||
return;
|
||||
}
|
||||
|
||||
@ -446,6 +516,22 @@ try_partial_nestloop_path(PlannerInfo *root,
|
||||
if (!add_partial_path_precheck(joinrel, workspace.total_cost, pathkeys))
|
||||
return;
|
||||
|
||||
/*
|
||||
* If the inner path is parameterized, it is parameterized by the topmost
|
||||
* parent of the outer rel, not the outer rel itself. Fix that.
|
||||
*/
|
||||
if (PATH_PARAM_BY_PARENT(inner_path, outer_path->parent))
|
||||
{
|
||||
inner_path = reparameterize_path_by_child(root, inner_path,
|
||||
outer_path->parent);
|
||||
|
||||
/*
|
||||
* If we could not translate the path, we can't create nest loop path.
|
||||
*/
|
||||
if (!inner_path)
|
||||
return;
|
||||
}
|
||||
|
||||
/* Might be good enough to be worth trying, so let's try it. */
|
||||
add_partial_path(joinrel, (Path *)
|
||||
create_nestloop_path(root,
|
||||
|
@ -14,10 +14,17 @@
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "miscadmin.h"
|
||||
#include "catalog/partition.h"
|
||||
#include "nodes/relation.h"
|
||||
#include "optimizer/clauses.h"
|
||||
#include "optimizer/joininfo.h"
|
||||
#include "optimizer/pathnode.h"
|
||||
#include "optimizer/paths.h"
|
||||
#include "optimizer/prep.h"
|
||||
#include "optimizer/cost.h"
|
||||
#include "utils/memutils.h"
|
||||
#include "utils/lsyscache.h"
|
||||
|
||||
|
||||
static void make_rels_by_clause_joins(PlannerInfo *root,
|
||||
@ -29,12 +36,17 @@ static void make_rels_by_clauseless_joins(PlannerInfo *root,
|
||||
static bool has_join_restriction(PlannerInfo *root, RelOptInfo *rel);
|
||||
static bool has_legal_joinclause(PlannerInfo *root, RelOptInfo *rel);
|
||||
static bool is_dummy_rel(RelOptInfo *rel);
|
||||
static void mark_dummy_rel(RelOptInfo *rel);
|
||||
static bool restriction_is_constant_false(List *restrictlist,
|
||||
bool only_pushed_down);
|
||||
static void populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1,
|
||||
RelOptInfo *rel2, RelOptInfo *joinrel,
|
||||
SpecialJoinInfo *sjinfo, List *restrictlist);
|
||||
static void try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1,
|
||||
RelOptInfo *rel2, RelOptInfo *joinrel,
|
||||
SpecialJoinInfo *parent_sjinfo,
|
||||
List *parent_restrictlist);
|
||||
static int match_expr_to_partition_keys(Expr *expr, RelOptInfo *rel,
|
||||
bool strict_op);
|
||||
|
||||
|
||||
/*
|
||||
@ -892,6 +904,9 @@ populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1,
|
||||
elog(ERROR, "unrecognized join type: %d", (int) sjinfo->jointype);
|
||||
break;
|
||||
}
|
||||
|
||||
/* Apply partition-wise join technique, if possible. */
|
||||
try_partition_wise_join(root, rel1, rel2, joinrel, sjinfo, restrictlist);
|
||||
}
|
||||
|
||||
|
||||
@ -1197,7 +1212,7 @@ is_dummy_rel(RelOptInfo *rel)
|
||||
* is that the best solution is to explicitly make the dummy path in the same
|
||||
* context the given RelOptInfo is in.
|
||||
*/
|
||||
static void
|
||||
void
|
||||
mark_dummy_rel(RelOptInfo *rel)
|
||||
{
|
||||
MemoryContext oldcontext;
|
||||
@ -1268,3 +1283,300 @@ restriction_is_constant_false(List *restrictlist, bool only_pushed_down)
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Assess whether join between given two partitioned relations can be broken
|
||||
* down into joins between matching partitions; a technique called
|
||||
* "partition-wise join"
|
||||
*
|
||||
* Partition-wise join is possible when a. Joining relations have same
|
||||
* partitioning scheme b. There exists an equi-join between the partition keys
|
||||
* of the two relations.
|
||||
*
|
||||
* Partition-wise join is planned as follows (details: optimizer/README.)
|
||||
*
|
||||
* 1. Create the RelOptInfos for joins between matching partitions i.e
|
||||
* child-joins and add paths to them.
|
||||
*
|
||||
* 2. Construct Append or MergeAppend paths across the set of child joins.
|
||||
* This second phase is implemented by generate_partition_wise_join_paths().
|
||||
*
|
||||
* The RelOptInfo, SpecialJoinInfo and restrictlist for each child join are
|
||||
* obtained by translating the respective parent join structures.
|
||||
*/
|
||||
static void
|
||||
try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2,
|
||||
RelOptInfo *joinrel, SpecialJoinInfo *parent_sjinfo,
|
||||
List *parent_restrictlist)
|
||||
{
|
||||
int nparts;
|
||||
int cnt_parts;
|
||||
|
||||
/* Guard against stack overflow due to overly deep partition hierarchy. */
|
||||
check_stack_depth();
|
||||
|
||||
/* Nothing to do, if the join relation is not partitioned. */
|
||||
if (!IS_PARTITIONED_REL(joinrel))
|
||||
return;
|
||||
|
||||
/*
|
||||
* set_rel_pathlist() may not create paths in children of an empty
|
||||
* partitioned table and so we can not add paths to child-joins. So, deem
|
||||
* such a join as unpartitioned. When a partitioned relation is deemed
|
||||
* empty because all its children are empty, dummy path will be set in
|
||||
* each of the children. In such a case we could still consider the join
|
||||
* as partitioned, but it might not help much.
|
||||
*/
|
||||
if (IS_DUMMY_REL(rel1) || IS_DUMMY_REL(rel2))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Since this join relation is partitioned, all the base relations
|
||||
* participating in this join must be partitioned and so are all the
|
||||
* intermediate join relations.
|
||||
*/
|
||||
Assert(IS_PARTITIONED_REL(rel1) && IS_PARTITIONED_REL(rel2));
|
||||
Assert(REL_HAS_ALL_PART_PROPS(rel1) && REL_HAS_ALL_PART_PROPS(rel2));
|
||||
|
||||
/*
|
||||
* The partition scheme of the join relation should match that of the
|
||||
* joining relations.
|
||||
*/
|
||||
Assert(joinrel->part_scheme == rel1->part_scheme &&
|
||||
joinrel->part_scheme == rel2->part_scheme);
|
||||
|
||||
/*
|
||||
* Since we allow partition-wise join only when the partition bounds of
|
||||
* the joining relations exactly match, the partition bounds of the join
|
||||
* should match those of the joining relations.
|
||||
*/
|
||||
Assert(partition_bounds_equal(joinrel->part_scheme->partnatts,
|
||||
joinrel->part_scheme->parttyplen,
|
||||
joinrel->part_scheme->parttypbyval,
|
||||
joinrel->boundinfo, rel1->boundinfo));
|
||||
Assert(partition_bounds_equal(joinrel->part_scheme->partnatts,
|
||||
joinrel->part_scheme->parttyplen,
|
||||
joinrel->part_scheme->parttypbyval,
|
||||
joinrel->boundinfo, rel2->boundinfo));
|
||||
|
||||
nparts = joinrel->nparts;
|
||||
|
||||
/* Allocate space to hold child-joins RelOptInfos, if not already done. */
|
||||
if (!joinrel->part_rels)
|
||||
joinrel->part_rels =
|
||||
(RelOptInfo **) palloc0(sizeof(RelOptInfo *) * nparts);
|
||||
|
||||
/*
|
||||
* Create child-join relations for this partitioned join, if those don't
|
||||
* exist. Add paths to child-joins for a pair of child relations
|
||||
* corresponding to the given pair of parent relations.
|
||||
*/
|
||||
for (cnt_parts = 0; cnt_parts < nparts; cnt_parts++)
|
||||
{
|
||||
RelOptInfo *child_rel1 = rel1->part_rels[cnt_parts];
|
||||
RelOptInfo *child_rel2 = rel2->part_rels[cnt_parts];
|
||||
SpecialJoinInfo *child_sjinfo;
|
||||
List *child_restrictlist;
|
||||
RelOptInfo *child_joinrel;
|
||||
Relids child_joinrelids;
|
||||
AppendRelInfo **appinfos;
|
||||
int nappinfos;
|
||||
|
||||
/* We should never try to join two overlapping sets of rels. */
|
||||
Assert(!bms_overlap(child_rel1->relids, child_rel2->relids));
|
||||
child_joinrelids = bms_union(child_rel1->relids, child_rel2->relids);
|
||||
appinfos = find_appinfos_by_relids(root, child_joinrelids, &nappinfos);
|
||||
|
||||
/*
|
||||
* Construct SpecialJoinInfo from parent join relations's
|
||||
* SpecialJoinInfo.
|
||||
*/
|
||||
child_sjinfo = build_child_join_sjinfo(root, parent_sjinfo,
|
||||
child_rel1->relids,
|
||||
child_rel2->relids);
|
||||
|
||||
/*
|
||||
* Construct restrictions applicable to the child join from those
|
||||
* applicable to the parent join.
|
||||
*/
|
||||
child_restrictlist =
|
||||
(List *) adjust_appendrel_attrs(root,
|
||||
(Node *) parent_restrictlist,
|
||||
nappinfos, appinfos);
|
||||
pfree(appinfos);
|
||||
|
||||
child_joinrel = joinrel->part_rels[cnt_parts];
|
||||
if (!child_joinrel)
|
||||
{
|
||||
child_joinrel = build_child_join_rel(root, child_rel1, child_rel2,
|
||||
joinrel, child_restrictlist,
|
||||
child_sjinfo,
|
||||
child_sjinfo->jointype);
|
||||
joinrel->part_rels[cnt_parts] = child_joinrel;
|
||||
}
|
||||
|
||||
Assert(bms_equal(child_joinrel->relids, child_joinrelids));
|
||||
|
||||
populate_joinrel_with_paths(root, child_rel1, child_rel2,
|
||||
child_joinrel, child_sjinfo,
|
||||
child_restrictlist);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if there exists an equi-join condition for each pair of
|
||||
* partition keys from given relations being joined.
|
||||
*/
|
||||
bool
|
||||
have_partkey_equi_join(RelOptInfo *rel1, RelOptInfo *rel2, JoinType jointype,
|
||||
List *restrictlist)
|
||||
{
|
||||
PartitionScheme part_scheme = rel1->part_scheme;
|
||||
ListCell *lc;
|
||||
int cnt_pks;
|
||||
bool pk_has_clause[PARTITION_MAX_KEYS];
|
||||
bool strict_op;
|
||||
|
||||
/*
|
||||
* This function should be called when the joining relations have same
|
||||
* partitioning scheme.
|
||||
*/
|
||||
Assert(rel1->part_scheme == rel2->part_scheme);
|
||||
Assert(part_scheme);
|
||||
|
||||
memset(pk_has_clause, 0, sizeof(pk_has_clause));
|
||||
foreach(lc, restrictlist)
|
||||
{
|
||||
RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
|
||||
OpExpr *opexpr;
|
||||
Expr *expr1;
|
||||
Expr *expr2;
|
||||
int ipk1;
|
||||
int ipk2;
|
||||
|
||||
/* If processing an outer join, only use its own join clauses. */
|
||||
if (IS_OUTER_JOIN(jointype) && rinfo->is_pushed_down)
|
||||
continue;
|
||||
|
||||
/* Skip clauses which can not be used for a join. */
|
||||
if (!rinfo->can_join)
|
||||
continue;
|
||||
|
||||
/* Skip clauses which are not equality conditions. */
|
||||
if (!rinfo->mergeopfamilies)
|
||||
continue;
|
||||
|
||||
opexpr = (OpExpr *) rinfo->clause;
|
||||
Assert(is_opclause(opexpr));
|
||||
|
||||
/*
|
||||
* The equi-join between partition keys is strict if equi-join between
|
||||
* at least one partition key is using a strict operator. See
|
||||
* explanation about outer join reordering identity 3 in
|
||||
* optimizer/README
|
||||
*/
|
||||
strict_op = op_strict(opexpr->opno);
|
||||
|
||||
/* Match the operands to the relation. */
|
||||
if (bms_is_subset(rinfo->left_relids, rel1->relids) &&
|
||||
bms_is_subset(rinfo->right_relids, rel2->relids))
|
||||
{
|
||||
expr1 = linitial(opexpr->args);
|
||||
expr2 = lsecond(opexpr->args);
|
||||
}
|
||||
else if (bms_is_subset(rinfo->left_relids, rel2->relids) &&
|
||||
bms_is_subset(rinfo->right_relids, rel1->relids))
|
||||
{
|
||||
expr1 = lsecond(opexpr->args);
|
||||
expr2 = linitial(opexpr->args);
|
||||
}
|
||||
else
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Only clauses referencing the partition keys are useful for
|
||||
* partition-wise join.
|
||||
*/
|
||||
ipk1 = match_expr_to_partition_keys(expr1, rel1, strict_op);
|
||||
if (ipk1 < 0)
|
||||
continue;
|
||||
ipk2 = match_expr_to_partition_keys(expr2, rel2, strict_op);
|
||||
if (ipk2 < 0)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If the clause refers to keys at different ordinal positions, it can
|
||||
* not be used for partition-wise join.
|
||||
*/
|
||||
if (ipk1 != ipk2)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* The clause allows partition-wise join if only it uses the same
|
||||
* operator family as that specified by the partition key.
|
||||
*/
|
||||
if (!list_member_oid(rinfo->mergeopfamilies,
|
||||
part_scheme->partopfamily[ipk1]))
|
||||
continue;
|
||||
|
||||
/* Mark the partition key as having an equi-join clause. */
|
||||
pk_has_clause[ipk1] = true;
|
||||
}
|
||||
|
||||
/* Check whether every partition key has an equi-join condition. */
|
||||
for (cnt_pks = 0; cnt_pks < part_scheme->partnatts; cnt_pks++)
|
||||
{
|
||||
if (!pk_has_clause[cnt_pks])
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the partition key from the given relation matching the given
|
||||
* expression. If found, return the index of the partition key, else return -1.
|
||||
*/
|
||||
static int
|
||||
match_expr_to_partition_keys(Expr *expr, RelOptInfo *rel, bool strict_op)
|
||||
{
|
||||
int cnt;
|
||||
|
||||
/* This function should be called only for partitioned relations. */
|
||||
Assert(rel->part_scheme);
|
||||
|
||||
/* Remove any relabel decorations. */
|
||||
while (IsA(expr, RelabelType))
|
||||
expr = (Expr *) (castNode(RelabelType, expr))->arg;
|
||||
|
||||
for (cnt = 0; cnt < rel->part_scheme->partnatts; cnt++)
|
||||
{
|
||||
ListCell *lc;
|
||||
|
||||
Assert(rel->partexprs);
|
||||
foreach(lc, rel->partexprs[cnt])
|
||||
{
|
||||
if (equal(lfirst(lc), expr))
|
||||
return cnt;
|
||||
}
|
||||
|
||||
if (!strict_op)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If it's a strict equi-join a NULL partition key on one side will
|
||||
* not join a NULL partition key on the other side. So, rows with NULL
|
||||
* partition key from a partition on one side can not join with those
|
||||
* from a non-matching partition on the other side. So, search the
|
||||
* nullable partition keys as well.
|
||||
*/
|
||||
Assert(rel->nullable_partexprs);
|
||||
foreach(lc, rel->nullable_partexprs[cnt])
|
||||
{
|
||||
if (equal(lfirst(lc), expr))
|
||||
return cnt;
|
||||
}
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
Reference in New Issue
Block a user