1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-13 07:41:39 +03:00

Add word_similarity to pg_trgm contrib module.

Patch introduces a concept of similarity over string and just a word from
another string.

Version of extension is not changed because 1.2 was already introduced in 9.6
release cycle, so, there wasn't a public version.

Author: Alexander Korotkov, Artur Zakirov
This commit is contained in:
Teodor Sigaev
2016-03-16 18:59:21 +03:00
parent 1c4f001b79
commit f576b17cd6
10 changed files with 726 additions and 75 deletions

View File

@ -191,6 +191,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
bool res;
Size querysize = VARSIZE(query);
gtrgm_consistent_cache *cache;
double nlimit;
/*
* We keep the extracted trigrams in cache, because trigram extraction is
@ -218,6 +219,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
switch (strategy)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
qtrg = generate_trgm(VARDATA(query),
querysize - VARHDRSZ);
break;
@ -286,16 +288,23 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
switch (strategy)
{
case SimilarityStrategyNumber:
/* Similarity search is exact */
*recheck = false;
case WordSimilarityStrategyNumber:
/* Similarity search is exact. Word similarity search is inexact */
*recheck = (strategy == WordSimilarityStrategyNumber);
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
/*
* Prevent gcc optimizing the tmpsml variable using volatile
* keyword. Otherwise comparison of nlimit and tmpsml may give
* wrong results.
*/
float4 volatile tmpsml = cnt_sml(qtrg, key, *recheck);
/* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
res = (*(int *) &tmpsml == *(int *) &similarity_threshold
|| tmpsml > similarity_threshold) ? true : false;
res = (*(int *) &tmpsml == *(int *) &nlimit || tmpsml > nlimit);
}
else if (ISALLTRUE(key))
{ /* non-leaf contains signature */
@ -309,8 +318,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
if (len == 0)
res = false;
else
res = (((((float8) count) / ((float8) len))) >= similarity_threshold)
? true : false;
res = (((((float8) count) / ((float8) len))) >= nlimit);
}
break;
case ILikeStrategyNumber:
@ -428,6 +436,7 @@ gtrgm_distance(PG_FUNCTION_ARGS)
StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
/* Oid subtype = PG_GETARG_OID(3); */
bool *recheck = (bool *) PG_GETARG_POINTER(4);
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
float8 res;
@ -463,9 +472,17 @@ gtrgm_distance(PG_FUNCTION_ARGS)
switch (strategy)
{
case DistanceStrategyNumber:
case WordDistanceStrategyNumber:
*recheck = strategy == WordDistanceStrategyNumber;
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
res = 1.0 - cnt_sml(key, qtrg);
/*
* Prevent gcc optimizing the sml variable using volatile
* keyword. Otherwise res can differ from the
* word_similarity_dist_op() function.
*/
float4 volatile sml = cnt_sml(qtrg, key, *recheck);
res = 1.0 - sml;
}
else if (ISALLTRUE(key))
{ /* all leafs contains orig trgm */