mirror of
https://github.com/postgres/postgres.git
synced 2025-10-24 01:29:19 +03:00
Obviously, in order to equality operator be satisfiable, target string must contain all the trigrams of the search string. On this base, we implement equality operator in GiST/GIN indexes with recheck. Discussion: https://postgr.es/m/CAOBaU_YWwtT7tdggtROacjdOdeYHCz-tmSwuC-j-TOG-g97J0w%40mail.gmail.com Author: Julien Rouhaud Reviewed-by: Tom Lane, Alexander Korotkov, Georgios Kokolatos, Erik Rijkers
360 lines
8.8 KiB
C
360 lines
8.8 KiB
C
/*
|
|
* contrib/pg_trgm/trgm_gin.c
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include "access/gin.h"
|
|
#include "access/stratnum.h"
|
|
#include "fmgr.h"
|
|
#include "trgm.h"
|
|
|
|
PG_FUNCTION_INFO_V1(gin_extract_trgm);
|
|
PG_FUNCTION_INFO_V1(gin_extract_value_trgm);
|
|
PG_FUNCTION_INFO_V1(gin_extract_query_trgm);
|
|
PG_FUNCTION_INFO_V1(gin_trgm_consistent);
|
|
PG_FUNCTION_INFO_V1(gin_trgm_triconsistent);
|
|
|
|
/*
|
|
* This function can only be called if a pre-9.1 version of the GIN operator
|
|
* class definition is present in the catalogs (probably as a consequence
|
|
* of upgrade-in-place). Cope.
|
|
*/
|
|
Datum
|
|
gin_extract_trgm(PG_FUNCTION_ARGS)
|
|
{
|
|
if (PG_NARGS() == 3)
|
|
return gin_extract_value_trgm(fcinfo);
|
|
if (PG_NARGS() == 7)
|
|
return gin_extract_query_trgm(fcinfo);
|
|
elog(ERROR, "unexpected number of arguments to gin_extract_trgm");
|
|
PG_RETURN_NULL();
|
|
}
|
|
|
|
Datum
|
|
gin_extract_value_trgm(PG_FUNCTION_ARGS)
|
|
{
|
|
text *val = (text *) PG_GETARG_TEXT_PP(0);
|
|
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
|
|
Datum *entries = NULL;
|
|
TRGM *trg;
|
|
int32 trglen;
|
|
|
|
*nentries = 0;
|
|
|
|
trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
|
|
trglen = ARRNELEM(trg);
|
|
|
|
if (trglen > 0)
|
|
{
|
|
trgm *ptr;
|
|
int32 i;
|
|
|
|
*nentries = trglen;
|
|
entries = (Datum *) palloc(sizeof(Datum) * trglen);
|
|
|
|
ptr = GETARR(trg);
|
|
for (i = 0; i < trglen; i++)
|
|
{
|
|
int32 item = trgm2int(ptr);
|
|
|
|
entries[i] = Int32GetDatum(item);
|
|
ptr++;
|
|
}
|
|
}
|
|
|
|
PG_RETURN_POINTER(entries);
|
|
}
|
|
|
|
Datum
|
|
gin_extract_query_trgm(PG_FUNCTION_ARGS)
|
|
{
|
|
text *val = (text *) PG_GETARG_TEXT_PP(0);
|
|
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
|
|
StrategyNumber strategy = PG_GETARG_UINT16(2);
|
|
|
|
/* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */
|
|
Pointer **extra_data = (Pointer **) PG_GETARG_POINTER(4);
|
|
|
|
/* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */
|
|
int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
|
|
Datum *entries = NULL;
|
|
TRGM *trg;
|
|
int32 trglen;
|
|
trgm *ptr;
|
|
TrgmPackedGraph *graph;
|
|
int32 i;
|
|
|
|
switch (strategy)
|
|
{
|
|
case SimilarityStrategyNumber:
|
|
case WordSimilarityStrategyNumber:
|
|
case StrictWordSimilarityStrategyNumber:
|
|
case EqualStrategyNumber:
|
|
trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
|
|
break;
|
|
case ILikeStrategyNumber:
|
|
#ifndef IGNORECASE
|
|
elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
|
|
#endif
|
|
/* FALL THRU */
|
|
case LikeStrategyNumber:
|
|
|
|
/*
|
|
* For wildcard search we extract all the trigrams that every
|
|
* potentially-matching string must include.
|
|
*/
|
|
trg = generate_wildcard_trgm(VARDATA_ANY(val),
|
|
VARSIZE_ANY_EXHDR(val));
|
|
break;
|
|
case RegExpICaseStrategyNumber:
|
|
#ifndef IGNORECASE
|
|
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
|
|
#endif
|
|
/* FALL THRU */
|
|
case RegExpStrategyNumber:
|
|
trg = createTrgmNFA(val, PG_GET_COLLATION(),
|
|
&graph, CurrentMemoryContext);
|
|
if (trg && ARRNELEM(trg) > 0)
|
|
{
|
|
/*
|
|
* Successful regex processing: store NFA-like graph as
|
|
* extra_data. GIN API requires an array of nentries
|
|
* Pointers, but we just put the same value in each element.
|
|
*/
|
|
trglen = ARRNELEM(trg);
|
|
*extra_data = (Pointer *) palloc(sizeof(Pointer) * trglen);
|
|
for (i = 0; i < trglen; i++)
|
|
(*extra_data)[i] = (Pointer) graph;
|
|
}
|
|
else
|
|
{
|
|
/* No result: have to do full index scan. */
|
|
*nentries = 0;
|
|
*searchMode = GIN_SEARCH_MODE_ALL;
|
|
PG_RETURN_POINTER(entries);
|
|
}
|
|
break;
|
|
default:
|
|
elog(ERROR, "unrecognized strategy number: %d", strategy);
|
|
trg = NULL; /* keep compiler quiet */
|
|
break;
|
|
}
|
|
|
|
trglen = ARRNELEM(trg);
|
|
*nentries = trglen;
|
|
|
|
if (trglen > 0)
|
|
{
|
|
entries = (Datum *) palloc(sizeof(Datum) * trglen);
|
|
ptr = GETARR(trg);
|
|
for (i = 0; i < trglen; i++)
|
|
{
|
|
int32 item = trgm2int(ptr);
|
|
|
|
entries[i] = Int32GetDatum(item);
|
|
ptr++;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If no trigram was extracted then we have to scan all the index.
|
|
*/
|
|
if (trglen == 0)
|
|
*searchMode = GIN_SEARCH_MODE_ALL;
|
|
|
|
PG_RETURN_POINTER(entries);
|
|
}
|
|
|
|
Datum
|
|
gin_trgm_consistent(PG_FUNCTION_ARGS)
|
|
{
|
|
bool *check = (bool *) PG_GETARG_POINTER(0);
|
|
StrategyNumber strategy = PG_GETARG_UINT16(1);
|
|
|
|
/* text *query = PG_GETARG_TEXT_PP(2); */
|
|
int32 nkeys = PG_GETARG_INT32(3);
|
|
Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
|
|
bool *recheck = (bool *) PG_GETARG_POINTER(5);
|
|
bool res;
|
|
int32 i,
|
|
ntrue;
|
|
double nlimit;
|
|
|
|
/* All cases served by this function are inexact */
|
|
*recheck = true;
|
|
|
|
switch (strategy)
|
|
{
|
|
case SimilarityStrategyNumber:
|
|
case WordSimilarityStrategyNumber:
|
|
case StrictWordSimilarityStrategyNumber:
|
|
nlimit = index_strategy_get_limit(strategy);
|
|
|
|
/* Count the matches */
|
|
ntrue = 0;
|
|
for (i = 0; i < nkeys; i++)
|
|
{
|
|
if (check[i])
|
|
ntrue++;
|
|
}
|
|
|
|
/*--------------------
|
|
* If DIVUNION is defined then similarity formula is:
|
|
* c / (len1 + len2 - c)
|
|
* where c is number of common trigrams and it stands as ntrue in
|
|
* this code. Here we don't know value of len2 but we can assume
|
|
* that c (ntrue) is a lower bound of len2, so upper bound of
|
|
* similarity is:
|
|
* c / (len1 + c - c) => c / len1
|
|
* If DIVUNION is not defined then similarity formula is:
|
|
* c / max(len1, len2)
|
|
* And again, c (ntrue) is a lower bound of len2, but c <= len1
|
|
* just by definition and, consequently, upper bound of
|
|
* similarity is just c / len1.
|
|
* So, independently on DIVUNION the upper bound formula is the same.
|
|
*/
|
|
res = (nkeys == 0) ? false :
|
|
(((((float4) ntrue) / ((float4) nkeys))) >= nlimit);
|
|
break;
|
|
case ILikeStrategyNumber:
|
|
#ifndef IGNORECASE
|
|
elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
|
|
#endif
|
|
/* FALL THRU */
|
|
case LikeStrategyNumber:
|
|
case EqualStrategyNumber:
|
|
/* Check if all extracted trigrams are presented. */
|
|
res = true;
|
|
for (i = 0; i < nkeys; i++)
|
|
{
|
|
if (!check[i])
|
|
{
|
|
res = false;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
case RegExpICaseStrategyNumber:
|
|
#ifndef IGNORECASE
|
|
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
|
|
#endif
|
|
/* FALL THRU */
|
|
case RegExpStrategyNumber:
|
|
if (nkeys < 1)
|
|
{
|
|
/* Regex processing gave no result: do full index scan */
|
|
res = true;
|
|
}
|
|
else
|
|
res = trigramsMatchGraph((TrgmPackedGraph *) extra_data[0],
|
|
check);
|
|
break;
|
|
default:
|
|
elog(ERROR, "unrecognized strategy number: %d", strategy);
|
|
res = false; /* keep compiler quiet */
|
|
break;
|
|
}
|
|
|
|
PG_RETURN_BOOL(res);
|
|
}
|
|
|
|
/*
|
|
* In all cases, GIN_TRUE is at least as favorable to inclusion as
|
|
* GIN_MAYBE. If no better option is available, simply treat
|
|
* GIN_MAYBE as if it were GIN_TRUE and apply the same test as the binary
|
|
* consistent function.
|
|
*/
|
|
Datum
|
|
gin_trgm_triconsistent(PG_FUNCTION_ARGS)
|
|
{
|
|
GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
|
|
StrategyNumber strategy = PG_GETARG_UINT16(1);
|
|
|
|
/* text *query = PG_GETARG_TEXT_PP(2); */
|
|
int32 nkeys = PG_GETARG_INT32(3);
|
|
Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
|
|
GinTernaryValue res = GIN_MAYBE;
|
|
int32 i,
|
|
ntrue;
|
|
bool *boolcheck;
|
|
double nlimit;
|
|
|
|
switch (strategy)
|
|
{
|
|
case SimilarityStrategyNumber:
|
|
case WordSimilarityStrategyNumber:
|
|
case StrictWordSimilarityStrategyNumber:
|
|
nlimit = index_strategy_get_limit(strategy);
|
|
|
|
/* Count the matches */
|
|
ntrue = 0;
|
|
for (i = 0; i < nkeys; i++)
|
|
{
|
|
if (check[i] != GIN_FALSE)
|
|
ntrue++;
|
|
}
|
|
|
|
/*
|
|
* See comment in gin_trgm_consistent() about * upper bound
|
|
* formula
|
|
*/
|
|
res = (nkeys == 0)
|
|
? GIN_FALSE : (((((float4) ntrue) / ((float4) nkeys)) >= nlimit)
|
|
? GIN_MAYBE : GIN_FALSE);
|
|
break;
|
|
case ILikeStrategyNumber:
|
|
#ifndef IGNORECASE
|
|
elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
|
|
#endif
|
|
/* FALL THRU */
|
|
case LikeStrategyNumber:
|
|
case EqualStrategyNumber:
|
|
/* Check if all extracted trigrams are presented. */
|
|
res = GIN_MAYBE;
|
|
for (i = 0; i < nkeys; i++)
|
|
{
|
|
if (check[i] == GIN_FALSE)
|
|
{
|
|
res = GIN_FALSE;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
case RegExpICaseStrategyNumber:
|
|
#ifndef IGNORECASE
|
|
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
|
|
#endif
|
|
/* FALL THRU */
|
|
case RegExpStrategyNumber:
|
|
if (nkeys < 1)
|
|
{
|
|
/* Regex processing gave no result: do full index scan */
|
|
res = GIN_MAYBE;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* As trigramsMatchGraph implements a monotonic boolean
|
|
* function, promoting all GIN_MAYBE keys to GIN_TRUE will
|
|
* give a conservative result.
|
|
*/
|
|
boolcheck = (bool *) palloc(sizeof(bool) * nkeys);
|
|
for (i = 0; i < nkeys; i++)
|
|
boolcheck[i] = (check[i] != GIN_FALSE);
|
|
if (!trigramsMatchGraph((TrgmPackedGraph *) extra_data[0],
|
|
boolcheck))
|
|
res = GIN_FALSE;
|
|
pfree(boolcheck);
|
|
}
|
|
break;
|
|
default:
|
|
elog(ERROR, "unrecognized strategy number: %d", strategy);
|
|
res = GIN_FALSE; /* keep compiler quiet */
|
|
break;
|
|
}
|
|
|
|
/* All cases served by this function are inexact */
|
|
Assert(res != GIN_TRUE);
|
|
PG_RETURN_GIN_TERNARY_VALUE(res);
|
|
}
|