1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-11 10:01:57 +03:00

Add support for nearest-neighbor (KNN) searches to SP-GiST

Previously, KNN searches were supported only by GiST.  SP-GiST is also capable
of supporting them, and this commit implements that support.  The SP-GiST scan
stack is replaced with a queue, which serves as a stack if no ordering is
specified.  KNN support is provided for three SP-GiST opclasses: quad_point_ops,
kd_point_ops and poly_ops (catversion is bumped).  Some parts common to the GiST
and SP-GiST KNN implementations are extracted into separate functions.

Discussion: https://postgr.es/m/570825e8-47d0-4732-2bf6-88d67d2d51c8%40postgrespro.ru
Author: Nikita Glukhov, Alexander Korotkov based on GSoC work by Vlad Sterzhanov
Review: Andrey Borodin, Alexander Korotkov
This commit is contained in:
Alexander Korotkov
2018-09-19 01:54:10 +03:00
parent d0cfc3d6a4
commit 2a6368343f
29 changed files with 1681 additions and 428 deletions

View File

@ -14,6 +14,7 @@ include $(top_builddir)/src/Makefile.global
OBJS = spgutils.o spginsert.o spgscan.o spgvacuum.o spgvalidate.o \
spgdoinsert.o spgxlog.o \
spgtextproc.o spgquadtreeproc.o spgkdtreeproc.o
spgtextproc.o spgquadtreeproc.o spgkdtreeproc.o \
spgproc.o
include $(top_srcdir)/src/backend/common.mk

View File

@ -41,7 +41,11 @@ contain exactly one inner tuple.
When the search traversal algorithm reaches an inner tuple, it chooses a set
of nodes to continue tree traverse in depth. If it reaches a leaf page it
scans a list of leaf tuples to find the ones that match the query.
scans a list of leaf tuples to find the ones that match the query. SP-GiST
also supports ordered (nearest-neighbor) searches: during such a scan, pending
nodes are put into a priority queue, so that traversal proceeds in
closest-first order.
The insertion algorithm descends the tree similarly, except it must choose
just one node to descend to from each inner tuple. Insertion might also have

View File

@ -16,9 +16,11 @@
#include "postgres.h"
#include "access/spgist.h"
#include "access/spgist_private.h"
#include "access/stratnum.h"
#include "catalog/pg_type.h"
#include "utils/builtins.h"
#include "utils/float.h"
#include "utils/geo_decls.h"
@ -162,6 +164,7 @@ spg_kd_inner_consistent(PG_FUNCTION_ARGS)
double coord;
int which;
int i;
BOX bboxes[2];
Assert(in->hasPrefix);
coord = DatumGetFloat8(in->prefixDatum);
@ -248,12 +251,87 @@ spg_kd_inner_consistent(PG_FUNCTION_ARGS)
}
/* We must descend into the children identified by which */
out->nodeNumbers = (int *) palloc(sizeof(int) * 2);
out->nNodes = 0;
/* Fast-path for no matching children */
if (!which)
PG_RETURN_VOID();
out->nodeNumbers = (int *) palloc(sizeof(int) * 2);
/*
* When ordering scan keys are specified, we have to calculate distances for
* them. In order to do that, we need to calculate bounding boxes for both
* child nodes. Calculating those bounding boxes at a non-zero level
* requires knowledge of the bounding box of the upper node. So, we save
* the bounding boxes in traversalValues.
*/
if (in->norderbys > 0)
{
BOX infArea;
BOX *area;
out->distances = (double **) palloc(sizeof(double *) * in->nNodes);
out->traversalValues = (void **) palloc(sizeof(void *) * in->nNodes);
if (in->level == 0)
{
float8 inf = get_float8_infinity();
infArea.high.x = inf;
infArea.high.y = inf;
infArea.low.x = -inf;
infArea.low.y = -inf;
area = &infArea;
}
else
{
area = (BOX *) in->traversalValue;
Assert(area);
}
bboxes[0].low = area->low;
bboxes[1].high = area->high;
if (in->level % 2)
{
/* split box by x */
bboxes[0].high.x = bboxes[1].low.x = coord;
bboxes[0].high.y = area->high.y;
bboxes[1].low.y = area->low.y;
}
else
{
/* split box by y */
bboxes[0].high.y = bboxes[1].low.y = coord;
bboxes[0].high.x = area->high.x;
bboxes[1].low.x = area->low.x;
}
}
for (i = 1; i <= 2; i++)
{
if (which & (1 << i))
out->nodeNumbers[out->nNodes++] = i - 1;
{
out->nodeNumbers[out->nNodes] = i - 1;
if (in->norderbys > 0)
{
MemoryContext oldCtx = MemoryContextSwitchTo(
in->traversalMemoryContext);
BOX *box = box_copy(&bboxes[i - 1]);
MemoryContextSwitchTo(oldCtx);
out->traversalValues[out->nNodes] = box;
out->distances[out->nNodes] = spg_key_orderbys_distances(
BoxPGetDatum(box), false,
in->orderbys, in->norderbys);
}
out->nNodes++;
}
}
/* Set up level increments, too */

View File

@ -0,0 +1,88 @@
/*-------------------------------------------------------------------------
*
* spgproc.c
* Common supporting procedures for SP-GiST opclasses.
*
*
* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/access/spgist/spgproc.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <math.h>
#include "access/spgist_private.h"
#include "utils/builtins.h"
#include "utils/float.h"
#include "utils/geo_decls.h"
/*
 * Distance between two points.  Each argument is wrapped with
 * PointPGetDatum, handed to point_distance through DirectFunctionCall2,
 * and the resulting Datum is unwrapped as a float8.
 */
#define point_point_distance(p1,p2) \
DatumGetFloat8(DirectFunctionCall2(point_distance, \
PointPGetDatum(p1), PointPGetDatum(p2)))
/*
 * Distance from a point to a box, assuming the box is axis-aligned.
 *
 * For each axis the gap is how far the point lies outside the box's
 * [low, high] range (zero when it is within the range); the result is the
 * hypotenuse of the two gaps.  NaN is returned when any coordinate of the
 * point or of the box's low corner is NaN.
 */
static double
point_box_distance(Point *point, BOX *box)
{
	double		xgap = 0.0;
	double		ygap = 0.0;

	/* NaN in the point or in the low corner poisons the whole result */
	if (isnan(point->x) || isnan(box->low.x) ||
		isnan(point->y) || isnan(box->low.y))
		return get_float8_nan();

	/* horizontal gap: left of the box, right of it, or none */
	if (point->x < box->low.x)
		xgap = box->low.x - point->x;
	else if (point->x > box->high.x)
		xgap = point->x - box->high.x;

	/* vertical gap: below the box, above it, or none */
	if (point->y < box->low.y)
		ygap = box->low.y - point->y;
	else if (point->y > box->high.y)
		ygap = point->y - box->high.y;

	return HYPOT(xgap, ygap);
}
/*
 * Compute the distance from "key" to the argument of every ordering scan
 * key and return them as a palloc'd array of norderbys doubles, in scan-key
 * order.
 *
 * When isLeaf is true the key is read as a point and point-to-point distance
 * is used; otherwise the key is read as a box and point-to-box distance is
 * used.  Scan key arguments are always read as points.
 */
double *
spg_key_orderbys_distances(Datum key, bool isLeaf,
						   ScanKey orderbys, int norderbys)
{
	double	   *result = (double *) palloc(norderbys * sizeof(double));
	int			i;

	for (i = 0; i < norderbys; i++)
	{
		Point	   *query = DatumGetPointP(orderbys[i].sk_argument);

		if (isLeaf)
			result[i] = point_point_distance(query, DatumGetPointP(key));
		else
			result[i] = point_box_distance(query, DatumGetBoxP(key));
	}

	return result;
}
/*
 * Return a freshly palloc'd duplicate of the given box.
 */
BOX *
box_copy(BOX *orig)
{
	BOX		   *copy;

	copy = (BOX *) palloc(sizeof(BOX));
	*copy = *orig;				/* plain struct assignment copies all fields */

	return copy;
}

View File

@ -17,8 +17,10 @@
#include "access/spgist.h"
#include "access/stratnum.h"
#include "access/spgist_private.h"
#include "catalog/pg_type.h"
#include "utils/builtins.h"
#include "utils/float.h"
#include "utils/geo_decls.h"
@ -77,6 +79,38 @@ getQuadrant(Point *centroid, Point *tst)
return 0;
}
/*
 * Returns the bounding box of a given quadrant inside a given bounding box.
 *
 * The centroid splits "bbox" into four rectangles, numbered as follows:
 *	1 - x from centroid to bbox high, y from centroid to bbox high
 *	2 - x from centroid to bbox high, y from bbox low to centroid
 *	3 - x from bbox low to centroid, y from bbox low to centroid
 *	4 - x from bbox low to centroid, y from centroid to bbox high
 *
 * The result is allocated with palloc.  Any quadrant number outside 1..4
 * raises an error instead of handing back a box with uninitialized
 * contents.
 */
static BOX *
getQuadrantArea(BOX *bbox, Point *centroid, int quadrant)
{
	BOX		   *result = (BOX *) palloc(sizeof(BOX));

	switch (quadrant)
	{
		case 1:
			result->high = bbox->high;
			result->low = *centroid;
			break;
		case 2:
			result->high.x = bbox->high.x;
			result->high.y = centroid->y;
			result->low.x = centroid->x;
			result->low.y = bbox->low.y;
			break;
		case 3:
			result->high = *centroid;
			result->low = bbox->low;
			break;
		case 4:
			result->high.x = centroid->x;
			result->high.y = bbox->high.y;
			result->low.x = bbox->low.x;
			result->low.y = centroid->y;
			break;
		default:
			/* should not happen: callers pass only 1..4 */
			elog(ERROR, "unrecognized quadrant number: %d", quadrant);
			break;
	}

	return result;
}
Datum
spg_quad_choose(PG_FUNCTION_ARGS)
@ -196,19 +230,68 @@ spg_quad_inner_consistent(PG_FUNCTION_ARGS)
spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0);
spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1);
Point *centroid;
BOX infbbox;
BOX *bbox = NULL;
int which;
int i;
Assert(in->hasPrefix);
centroid = DatumGetPointP(in->prefixDatum);
/*
* When ordering scan keys are specified, we have to calculate distances for
* them. In order to do that, we need to calculate bounding boxes for all
* child nodes. Calculating those bounding boxes at a non-zero level
* requires knowledge of the bounding box of the upper node. So, we save
* the bounding boxes in traversalValues.
*/
if (in->norderbys > 0)
{
out->distances = (double **) palloc(sizeof(double *) * in->nNodes);
out->traversalValues = (void **) palloc(sizeof(void *) * in->nNodes);
if (in->level == 0)
{
double inf = get_float8_infinity();
infbbox.high.x = inf;
infbbox.high.y = inf;
infbbox.low.x = -inf;
infbbox.low.y = -inf;
bbox = &infbbox;
}
else
{
bbox = in->traversalValue;
Assert(bbox);
}
}
if (in->allTheSame)
{
/* Report that all nodes should be visited */
out->nNodes = in->nNodes;
out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes);
for (i = 0; i < in->nNodes; i++)
{
out->nodeNumbers[i] = i;
if (in->norderbys > 0)
{
MemoryContext oldCtx = MemoryContextSwitchTo(
in->traversalMemoryContext);
/* Use parent quadrant box as traversalValue */
BOX *quadrant = box_copy(bbox);
MemoryContextSwitchTo(oldCtx);
out->traversalValues[i] = quadrant;
out->distances[i] = spg_key_orderbys_distances(
BoxPGetDatum(quadrant), false,
in->orderbys, in->norderbys);
}
}
PG_RETURN_VOID();
}
@ -286,13 +369,37 @@ spg_quad_inner_consistent(PG_FUNCTION_ARGS)
break; /* no need to consider remaining conditions */
}
out->levelAdds = palloc(sizeof(int) * 4);
for (i = 0; i < 4; ++i)
out->levelAdds[i] = 1;
/* We must descend into the quadrant(s) identified by which */
out->nodeNumbers = (int *) palloc(sizeof(int) * 4);
out->nNodes = 0;
for (i = 1; i <= 4; i++)
{
if (which & (1 << i))
out->nodeNumbers[out->nNodes++] = i - 1;
{
out->nodeNumbers[out->nNodes] = i - 1;
if (in->norderbys > 0)
{
MemoryContext oldCtx = MemoryContextSwitchTo(
in->traversalMemoryContext);
BOX *quadrant = getQuadrantArea(bbox, centroid, i);
MemoryContextSwitchTo(oldCtx);
out->traversalValues[out->nNodes] = quadrant;
out->distances[out->nNodes] = spg_key_orderbys_distances(
BoxPGetDatum(quadrant), false,
in->orderbys, in->norderbys);
}
out->nNodes++;
}
}
PG_RETURN_VOID();
@ -356,5 +463,11 @@ spg_quad_leaf_consistent(PG_FUNCTION_ARGS)
break;
}
if (res && in->norderbys > 0)
/* ok, it passes -> let's compute the distances */
out->distances = spg_key_orderbys_distances(
BoxPGetDatum(in->leafDatum), true,
in->orderbys, in->norderbys);
PG_RETURN_BOOL(res);
}

File diff suppressed because it is too large Load Diff

View File

@ -15,17 +15,26 @@
#include "postgres.h"
#include "access/amvalidate.h"
#include "access/htup_details.h"
#include "access/reloptions.h"
#include "access/spgist_private.h"
#include "access/transam.h"
#include "access/xact.h"
#include "catalog/pg_amop.h"
#include "optimizer/paths.h"
#include "storage/bufmgr.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "utils/builtins.h"
#include "utils/catcache.h"
#include "utils/index_selfuncs.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
/*
 * Prototype for spgcanorderbyop().
 * NOTE(review): neither a definition nor a caller of this function is
 * visible in this excerpt -- confirm that the declaration is still needed.
 */
extern Expr *spgcanorderbyop(IndexOptInfo *index,
PathKey *pathkey, int pathkeyno,
Expr *orderby_clause, int *indexcol_p);
/*
* SP-GiST handler function: return IndexAmRoutine with access method parameters
@ -39,7 +48,7 @@ spghandler(PG_FUNCTION_ARGS)
amroutine->amstrategies = 0;
amroutine->amsupport = SPGISTNProc;
amroutine->amcanorder = false;
amroutine->amcanorderbyop = false;
amroutine->amcanorderbyop = true;
amroutine->amcanbackward = false;
amroutine->amcanunique = false;
amroutine->amcanmulticol = false;
@ -61,7 +70,7 @@ spghandler(PG_FUNCTION_ARGS)
amroutine->amcanreturn = spgcanreturn;
amroutine->amcostestimate = spgcostestimate;
amroutine->amoptions = spgoptions;
amroutine->amproperty = NULL;
amroutine->amproperty = spgproperty;
amroutine->amvalidate = spgvalidate;
amroutine->ambeginscan = spgbeginscan;
amroutine->amrescan = spgrescan;
@ -949,3 +958,82 @@ SpGistPageAddNewItem(SpGistState *state, Page page, Item item, Size size,
return offnum;
}
/*
 * spgproperty() -- Check boolean properties of indexes.
 *
 * This is optional for most AMs, but is required for SP-GiST because the core
 * property code doesn't support AMPROP_DISTANCE_ORDERABLE.
 *
 * Returns false when the inquiry is not handled here (index-level inquiry,
 * or any property other than AMPROP_DISTANCE_ORDERABLE).  Otherwise returns
 * true, with the answer in *res, or *isnull set to true when the column's
 * opclass or its family/input type cannot be looked up.
 */
bool
spgproperty(Oid index_oid, int attno,
			IndexAMProperty prop, const char *propname,
			bool *res, bool *isnull)
{
	Oid			opclass,
				opfamily,
				opcintype;
	CatCList   *amops;
	int			idx;

	/* Only column-level inquiries about distance ordering are answered */
	if (attno == 0)
		return false;
	if (prop != AMPROP_DISTANCE_ORDERABLE)
		return false;

	/*
	 * Currently, SP-GiST distance-ordered scans require that there be a
	 * distance operator in the opclass with the default types. So we assume
	 * that if such an operator exists, then there's a reason for it.
	 */

	/*
	 * Find the column's opclass, then its operator family and input type.
	 * If either lookup fails, the answer is NULL.
	 */
	opclass = get_index_column_opclass(index_oid, attno);
	if (!OidIsValid(opclass) ||
		!get_opclass_opfamily_and_input_type(opclass, &opfamily, &opcintype))
	{
		*isnull = true;
		return true;
	}

	/* Scan the family's operators for a usable ordering operator */
	amops = SearchSysCacheList1(AMOPSTRATEGY,
								ObjectIdGetDatum(opfamily));

	*res = false;

	for (idx = 0; idx < amops->n_members; idx++)
	{
		Form_pg_amop amop;

		amop = (Form_pg_amop) GETSTRUCT(&amops->members[idx]->tuple);

		/* must be an ORDER BY operator ... */
		if (amop->amoppurpose != AMOP_ORDER)
			continue;

		/* ... taking the opclass input type on at least one side ... */
		if (amop->amoplefttype != opcintype &&
			amop->amoprighttype != opcintype)
			continue;

		/* ... whose result type is sortable by the claimed sort family */
		if (!opfamily_can_sort_type(amop->amopsortfamily,
									get_op_rettype(amop->amopopr)))
			continue;

		*res = true;
		break;
	}

	ReleaseSysCacheList(amops);

	*isnull = false;

	return true;
}

View File

@ -187,6 +187,7 @@ spgvalidate(Oid opclassoid)
{
HeapTuple oprtup = &oprlist->members[i]->tuple;
Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);
Oid op_rettype;
/* TODO: Check that only allowed strategy numbers exist */
if (oprform->amopstrategy < 1 || oprform->amopstrategy > 63)
@ -200,20 +201,26 @@ spgvalidate(Oid opclassoid)
result = false;
}
/* spgist doesn't support ORDER BY operators */
if (oprform->amoppurpose != AMOP_SEARCH ||
OidIsValid(oprform->amopsortfamily))
/* spgist supports ORDER BY operators */
if (oprform->amoppurpose != AMOP_SEARCH)
{
ereport(INFO,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s",
opfamilyname, "spgist",
format_operator(oprform->amopopr))));
result = false;
/* ... and operator result must match the claimed btree opfamily */
op_rettype = get_op_rettype(oprform->amopopr);
if (!opfamily_can_sort_type(oprform->amopsortfamily, op_rettype))
{
ereport(INFO,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s",
opfamilyname, "spgist",
format_operator(oprform->amopopr))));
result = false;
}
}
else
op_rettype = BOOLOID;
/* Check operator signature --- same for all spgist strategies */
if (!check_amop_signature(oprform->amopopr, BOOLOID,
if (!check_amop_signature(oprform->amopopr, op_rettype,
oprform->amoplefttype,
oprform->amoprighttype))
{