1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-27 07:42:10 +03:00

Fix hash partition pruning with asymmetric partition sets.

perform_pruning_combine_step() was not taught about the number of
partition indexes used in hash partitioning; more embarrassingly,
get_matching_hash_bounds() also had it wrong.  These errors are masked
in the common case where all the partitions have the same modulus
and no partition is missing.  However, with missing or unequal-size
partitions, we could erroneously prune some partitions that need
to be scanned, leading to silently wrong query answers.

While a minimal-footprint fix for this could be to export
get_partition_bound_num_indexes and make the incorrect functions use it,
I'm of the opinion that that function should never have existed in the
first place.  It's not reasonable data structure design that
PartitionBoundInfoData lacks any explicit record of the length of
its indexes[] array.  Perhaps that was all right when it could always
be assumed equal to ndatums, but something should have been done about
it as soon as that stopped being true.  Putting in an explicit
"nindexes" field makes both partition_bounds_equal() and
partition_bounds_copy() simpler, safer, and faster than before,
and removes explicit knowledge of the number-of-partition-indexes
rules from some other places too.

This change also makes get_hash_partition_greatest_modulus obsolete.
I left that in place in case any external code uses it, but no core
code does anymore.

Per bug #16840 from Michał Albrycht.  Back-patch to v11 where the
hash partitioning code came in.  (In the back branches, add the new
field at the end of PartitionBoundInfoData to minimize ABI risks.)

Discussion: https://postgr.es/m/16840-571a22976f829ad4@postgresql.org
This commit is contained in:
Tom Lane
2021-01-28 13:41:55 -05:00
parent 1b242f42ba
commit 1d9351a87c
6 changed files with 125 additions and 142 deletions

View File

@@ -224,7 +224,6 @@ static int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc,
Oid *partcollation,
PartitionBoundInfo boundinfo,
PartitionRangeBound *probe, int32 *cmpval);
static int get_partition_bound_num_indexes(PartitionBoundInfo b);
static Expr *make_partition_op_expr(PartitionKey key, int keynum,
uint16 strategy, Expr *arg1, Expr *arg2);
static Oid get_partition_operator(PartitionKey key, int col,
@@ -398,6 +397,7 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts,
boundinfo->ndatums = ndatums;
boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
boundinfo->nindexes = greatest_modulus;
boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int));
for (i = 0; i < greatest_modulus; i++)
boundinfo->indexes[i] = -1;
@@ -530,6 +530,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts,
boundinfo->ndatums = ndatums;
boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
boundinfo->nindexes = ndatums;
boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));
/*
@@ -725,8 +726,9 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts,
/*
* For range partitioning, an additional value of -1 is stored as the last
* element.
* element of the indexes[] array.
*/
boundinfo->nindexes = ndatums + 1;
boundinfo->indexes = (int *) palloc((ndatums + 1) * sizeof(int));
for (i = 0; i < ndatums; i++)
@@ -807,45 +809,41 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
if (b1->ndatums != b2->ndatums)
return false;
if (b1->nindexes != b2->nindexes)
return false;
if (b1->null_index != b2->null_index)
return false;
if (b1->default_index != b2->default_index)
return false;
/* For all partition strategies, the indexes[] arrays have to match */
for (i = 0; i < b1->nindexes; i++)
{
if (b1->indexes[i] != b2->indexes[i])
return false;
}
/* Finally, compare the datums[] arrays */
if (b1->strategy == PARTITION_STRATEGY_HASH)
{
int greatest_modulus = get_hash_partition_greatest_modulus(b1);
/*
* If two hash partitioned tables have different greatest moduli,
* their partition schemes don't match.
*/
if (greatest_modulus != get_hash_partition_greatest_modulus(b2))
return false;
/*
* We arrange the partitions in the ascending order of their moduli
* and remainders. Also, every modulus is a factor of the next larger
* modulus. Therefore we can safely store the index of a given partition
* in the indexes array at the remainder of that partition. Also entries at
* (remainder + N * modulus) positions in indexes array are all same
* for (modulus, remainder) specification for any partition. Thus
* datums array from both the given bounds are same, if and only if
* their indexes array will be same. So, it suffices to compare
* indexes array.
*/
for (i = 0; i < greatest_modulus; i++)
if (b1->indexes[i] != b2->indexes[i])
return false;
#ifdef USE_ASSERT_CHECKING
/*
* Nonetheless make sure that the bounds are indeed same when the
* for (modulus, remainder) specification for any partition. Thus the
* datums arrays from the given bounds are the same, if and only if
* their indexes arrays are the same. So, it suffices to compare the
* indexes arrays.
*
* Nonetheless make sure that the bounds are indeed the same when the
* indexes match. A hash partition bound stores the modulus and remainder
* at b1->datums[i][0] and b1->datums[i][1], respectively.
*/
#ifdef USE_ASSERT_CHECKING
for (i = 0; i < b1->ndatums; i++)
Assert((b1->datums[i][0] == b2->datums[i][0] &&
b1->datums[i][1] == b2->datums[i][1]));
@@ -891,15 +889,7 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
parttypbyval[j], parttyplen[j]))
return false;
}
if (b1->indexes[i] != b2->indexes[i])
return false;
}
/* There are ndatums+1 indexes in case of range partitions */
if (b1->strategy == PARTITION_STRATEGY_RANGE &&
b1->indexes[i] != b2->indexes[i])
return false;
}
return true;
}
@@ -920,8 +910,8 @@ partition_bounds_copy(PartitionBoundInfo src,
PartitionBoundInfo dest;
int i;
int ndatums;
int nindexes;
int partnatts;
int num_indexes;
bool hash_part;
int natts;
@@ -929,10 +919,9 @@ partition_bounds_copy(PartitionBoundInfo src,
dest->strategy = src->strategy;
ndatums = dest->ndatums = src->ndatums;
nindexes = dest->nindexes = src->nindexes;
partnatts = key->partnatts;
num_indexes = get_partition_bound_num_indexes(src);
/* List partitioned tables have only a single partition key. */
Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1);
@@ -990,8 +979,8 @@ partition_bounds_copy(PartitionBoundInfo src,
}
}
dest->indexes = (int *) palloc(sizeof(int) * num_indexes);
memcpy(dest->indexes, src->indexes, sizeof(int) * num_indexes);
dest->indexes = (int *) palloc(sizeof(int) * nindexes);
memcpy(dest->indexes, src->indexes, sizeof(int) * nindexes);
dest->null_index = src->null_index;
dest->default_index = src->default_index;
@@ -2456,6 +2445,7 @@ build_merged_partition_bounds(char strategy, List *merged_datums,
}
Assert(list_length(merged_indexes) == ndatums);
merged_bounds->nindexes = ndatums;
merged_bounds->indexes = (int *) palloc(sizeof(int) * ndatums);
pos = 0;
foreach(lc, merged_indexes)
@@ -2889,7 +2879,7 @@ check_new_partition_bound(char *relname, Relation parent,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
errmsg("every hash partition modulus must be a factor of the next larger modulus")));
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
greatest_modulus = boundinfo->nindexes;
remainder = spec->remainder;
/*
@@ -3282,18 +3272,15 @@ check_default_partition_contents(Relation parent, Relation default_rel,
/*
 * get_hash_partition_greatest_modulus
 *
 * Returns the greatest modulus of the hash partition bound.
 *
 * Two ways of obtaining that value appear in the body: reading the modulus
 * stored in the last entry of the datums array (hash partitions are arranged
 * in the ascending order of their moduli and remainders, so the greatest
 * modulus is at the end), and reading the bound's nindexes field.
 *
 * This is no longer used in the core code, but we keep it around
 * in case external modules are using it.
 *
 * NOTE(review): both return statements are present, so the nindexes-based
 * one can never execute as written.  This looks like the old and new lines
 * of a change shown together -- confirm which single return is intended.
 */
int
get_hash_partition_greatest_modulus(PartitionBoundInfo bound)
{
/* Caller must pass a non-NULL bound that uses the hash strategy. */
Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH);
/* The datums-based lookup below needs at least one datum to index. */
Assert(bound->datums && bound->ndatums > 0);
/* The modulus stored in the last datum is expected to be positive. */
Assert(DatumGetInt32(bound->datums[bound->ndatums - 1][0]) > 0);
/* Modulus is kept in column 0 of each hash datum; take the last one. */
return DatumGetInt32(bound->datums[bound->ndatums - 1][0]);
/* Unreachable as shown; see NOTE(review) in the header comment. */
return bound->nindexes;
}
/*
@@ -3697,46 +3684,6 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
b1, b2);
}
/*
 * get_partition_bound_num_indexes
 *
 * Reports how many entries the indexes[] array of the given partition
 * bound contains.  The count depends on the partitioning strategy:
 *
 *	hash:	as many entries as the greatest modulus
 *	list:	one entry per datum
 *	range:	one entry per datum, plus one extra
 *
 * Any other strategy value is reported through elog(ERROR).
 */
static int
get_partition_bound_num_indexes(PartitionBoundInfo bound)
{
	Assert(bound);

	switch (bound->strategy)
	{
		case PARTITION_STRATEGY_RANGE:
			/* Range bounds carry one more index than they have datums. */
			return bound->ndatums + 1;

		case PARTITION_STRATEGY_LIST:
			/* List bounds have exactly one index per datum. */
			return bound->ndatums;

		case PARTITION_STRATEGY_HASH:
			/* Hash bounds are sized by the greatest modulus. */
			return get_hash_partition_greatest_modulus(bound);

		default:
			break;
	}

	elog(ERROR, "unexpected partition strategy: %d",
		 (int) bound->strategy);

	/* elog(ERROR) is not expected to return; this only quiets compilers. */
	return 0;
}
/*
* get_partition_operator
*

View File

@@ -781,7 +781,10 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps)
scan_default = final_result->scan_default;
while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0)
{
int partindex = context->boundinfo->indexes[i];
int partindex;
Assert(i < context->boundinfo->nindexes);
partindex = context->boundinfo->indexes[i];
if (partindex < 0)
{
@@ -2514,20 +2517,19 @@ get_matching_hash_bounds(PartitionPruneContext *context,
for (i = 0; i < partnatts; i++)
isnull[i] = bms_is_member(i, nullkeys);
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
rowHash = compute_partition_hash_value(partnatts, partsupfunc, partcollation,
values, isnull);
greatest_modulus = boundinfo->nindexes;
if (partindices[rowHash % greatest_modulus] >= 0)
result->bound_offsets =
bms_make_singleton(rowHash % greatest_modulus);
}
else
{
/* Getting here means at least one hash partition exists. */
Assert(boundinfo->ndatums > 0);
/* Report all valid offsets into the boundinfo->indexes array. */
result->bound_offsets = bms_add_range(NULL, 0,
boundinfo->ndatums - 1);
boundinfo->nindexes - 1);
}
/*
@@ -3388,30 +3390,20 @@ perform_pruning_combine_step(PartitionPruneContext *context,
PartitionPruneStepCombine *cstep,
PruneStepResult **step_results)
{
ListCell *lc1;
PruneStepResult *result = NULL;
PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
bool firststep;
ListCell *lc1;
/*
* A combine step without any source steps is an indication to not perform
* any partition pruning. Return all datum indexes in that case.
*/
result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
if (list_length(cstep->source_stepids) == 0)
if (cstep->source_stepids == NIL)
{
PartitionBoundInfo boundinfo = context->boundinfo;
int rangemax;
/*
* Add all valid offsets into the boundinfo->indexes array. For range
* partitioning, boundinfo->indexes contains (boundinfo->ndatums + 1)
* valid entries; otherwise there are boundinfo->ndatums.
*/
rangemax = context->strategy == PARTITION_STRATEGY_RANGE ?
boundinfo->ndatums : boundinfo->ndatums - 1;
result->bound_offsets =
bms_add_range(result->bound_offsets, 0, rangemax);
bms_add_range(NULL, 0, boundinfo->nindexes - 1);
result->scan_default = partition_bound_has_default(boundinfo);
result->scan_null = partition_bound_accepts_nulls(boundinfo);
return result;