Add word_similarity to pg_trgm contrib module.

This patch introduces the concept of similarity between a string and a single word of another string. The extension version is not changed because 1.2 was already introduced in the 9.6 release cycle, so no public release contains that version yet. Author: Alexander Korotkov, Artur Zakirov
2025-12-06 00:02:13 +03:00 · 2016-03-16 18:59:21 +03:00
parent 1c4f001b79
commit f576b17cd6
10 changed files with 726 additions and 75 deletions
--- a/contrib/pg_trgm/Makefile
+++ b/contrib/pg_trgm/Makefile
@@ -7,7 +7,7 @@ EXTENSION = pg_trgm
 DATA = pg_trgm--1.2.sql pg_trgm--1.0--1.1.sql pg_trgm--1.1--1.2.sql pg_trgm--unpackaged--1.0.sql
 PGFILEDESC = "pg_trgm - trigram matching"
-REGRESS = pg_trgm
+REGRESS = pg_trgm pg_word_trgm
 ifdef USE_PGXS
 PG_CONFIG = pg_config
--- a/contrib/pg_trgm/expected/pg_trgm.out
+++ b/contrib/pg_trgm/expected/pg_trgm.out
@@ -59,7 +59,7 @@ select similarity('---', '####---');
          0
 (1 row)
-CREATE TABLE test_trgm(t text);
+CREATE TABLE test_trgm(t text COLLATE "C");
 \copy test_trgm from 'data/trgm.data'
 select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
      t      |   sml    
@@ -3467,7 +3467,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
 qwertyu0988 | 0.333333
 (1 row)
-create table test2(t text);
+create table test2(t text COLLATE "C");
 insert into test2 values ('abcdef');
 insert into test2 values ('quark');
 insert into test2 values ('  z foo bar');
--- a/contrib/pg_trgm/pg_trgm--1.1--1.2.sql
+++ b/contrib/pg_trgm/pg_trgm--1.1--1.2.sql
@@ -3,10 +3,72 @@
 -- complain if script is sourced in psql, rather than via ALTER EXTENSION
 \echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.2'" to load this file. \quit
 -- Similarity between the first argument and the most similar word of the
 -- second argument.
 CREATE FUNCTION word_similarity(text,text)
 RETURNS float4
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT IMMUTABLE;
 -- Procedure behind the <% operator: true when the word similarity of the
 -- arguments reaches pg_trgm.word_similarity_threshold.
 CREATE FUNCTION word_similarity_op(text,text)
 RETURNS bool
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT STABLE;  -- stable because depends on pg_trgm.word_similarity_threshold
 -- Procedure behind the %> operator: same test as word_similarity_op with
 -- the two arguments swapped.
 CREATE FUNCTION word_similarity_commutator_op(text,text)
 RETURNS bool
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT STABLE;  -- stable because depends on pg_trgm.word_similarity_threshold
 -- Procedure behind the <<-> operator: one minus the word similarity of the
 -- arguments.  It does not consult the threshold.
 CREATE FUNCTION word_similarity_dist_op(text,text)
 RETURNS float4
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT IMMUTABLE;
 -- Procedure behind the <->> operator: same distance as
 -- word_similarity_dist_op with the two arguments swapped.
 CREATE FUNCTION word_similarity_dist_commutator_op(text,text)
 RETURNS float4
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT IMMUTABLE;
 -- pattern <% string: word similarity test against the threshold.
 CREATE OPERATOR <% (
        LEFTARG = text,
        RIGHTARG = text,
        PROCEDURE = word_similarity_op,
        COMMUTATOR = '%>',
        RESTRICT = contsel,
        JOIN = contjoinsel
 );
 -- string %> pattern: commutator of <%.
 CREATE OPERATOR %> (
        LEFTARG = text,
        RIGHTARG = text,
        PROCEDURE = word_similarity_commutator_op,
        COMMUTATOR = '<%',
        RESTRICT = contsel,
        JOIN = contjoinsel
 );
 -- pattern <<-> string: word similarity distance.
 CREATE OPERATOR <<-> (
        LEFTARG = text,
        RIGHTARG = text,
        PROCEDURE = word_similarity_dist_op,
        COMMUTATOR = '<->>'
 );
 -- string <->> pattern: commutator of <<->.
 CREATE OPERATOR <->> (
        LEFTARG = text,
        RIGHTARG = text,
        PROCEDURE = word_similarity_dist_commutator_op,
        COMMUTATOR = '<<->'
 );
 CREATE FUNCTION gin_trgm_triconsistent(internal, int2, text, int4, internal, internal, internal)
 RETURNS "char"
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;
 -- Register the word similarity operators with the GiST operator family:
 -- strategy 7 is the %> search operator, strategy 8 the <->> ordering operator.
 ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
        OPERATOR        7       %> (text, text),
        OPERATOR        8       <->> (text, text) FOR ORDER BY pg_catalog.float_ops;
 ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
-        FUNCTION        6    (text, text)   gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);
+        OPERATOR        7       %> (text, text),
        FUNCTION        6      (text, text)   gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);
--- a/contrib/pg_trgm/pg_trgm--1.2.sql
+++ b/contrib/pg_trgm/pg_trgm--1.2.sql
@@ -39,6 +39,39 @@ CREATE OPERATOR % (
        JOIN = contjoinsel
 );
 -- Similarity between the first argument and the most similar word of the
 -- second argument.
 CREATE FUNCTION word_similarity(text,text)
 RETURNS float4
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT IMMUTABLE;
 -- Procedure behind the <% operator: true when the word similarity of the
 -- arguments reaches pg_trgm.word_similarity_threshold.
 CREATE FUNCTION word_similarity_op(text,text)
 RETURNS bool
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT STABLE;  -- stable because depends on pg_trgm.word_similarity_threshold
 -- Procedure behind the %> operator: same test as word_similarity_op with
 -- the two arguments swapped.
 CREATE FUNCTION word_similarity_commutator_op(text,text)
 RETURNS bool
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT STABLE;  -- stable because depends on pg_trgm.word_similarity_threshold
 -- pattern <% string: word similarity test against the threshold.
 CREATE OPERATOR <% (
        LEFTARG = text,
        RIGHTARG = text,
        PROCEDURE = word_similarity_op,
        COMMUTATOR = '%>',
        RESTRICT = contsel,
        JOIN = contjoinsel
 );
 -- string %> pattern: commutator of <%.
 CREATE OPERATOR %> (
        LEFTARG = text,
        RIGHTARG = text,
        PROCEDURE = word_similarity_commutator_op,
        COMMUTATOR = '<%',
        RESTRICT = contsel,
        JOIN = contjoinsel
 );
 CREATE FUNCTION similarity_dist(text,text)
 RETURNS float4
 AS 'MODULE_PATHNAME'
@@ -51,6 +84,30 @@ CREATE OPERATOR <-> (
        COMMUTATOR = '<->'
 );
 -- Procedure behind the <<-> operator: one minus the word similarity of the
 -- arguments.  It does not consult the threshold.
 CREATE FUNCTION word_similarity_dist_op(text,text)
 RETURNS float4
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT IMMUTABLE;
 -- Procedure behind the <->> operator: same distance as
 -- word_similarity_dist_op with the two arguments swapped.
 CREATE FUNCTION word_similarity_dist_commutator_op(text,text)
 RETURNS float4
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT IMMUTABLE;
 -- pattern <<-> string: word similarity distance.
 CREATE OPERATOR <<-> (
        LEFTARG = text,
        RIGHTARG = text,
        PROCEDURE = word_similarity_dist_op,
        COMMUTATOR = '<->>'
 );
 -- string <->> pattern: commutator of <<->.
 CREATE OPERATOR <->> (
        LEFTARG = text,
        RIGHTARG = text,
        PROCEDURE = word_similarity_dist_commutator_op,
        COMMUTATOR = '<<->'
 );
 -- gist key
 CREATE FUNCTION gtrgm_in(cstring)
 RETURNS gtrgm
@@ -140,6 +197,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
        OPERATOR        5       pg_catalog.~ (text, text),
        OPERATOR        6       pg_catalog.~* (text, text);
 -- Add operators that are new in 9.6 (pg_trgm 1.2).
 -- Strategy 7 is the %> word similarity search operator; strategy 8 is the
 -- <->> word similarity distance operator, usable for ORDER BY.
 ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
        OPERATOR        7       %> (text, text),
        OPERATOR        8       <->> (text, text) FOR ORDER BY pg_catalog.float_ops;
 -- support functions for gin
 CREATE FUNCTION gin_extract_value_trgm(text, internal)
 RETURNS internal
@@ -187,4 +250,5 @@ AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;
 ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
        OPERATOR        7       %> (text, text),
        FUNCTION        6      (text,text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);
--- a/contrib/pg_trgm/sql/pg_trgm.sql
+++ b/contrib/pg_trgm/sql/pg_trgm.sql
@@ -13,7 +13,7 @@ select similarity('wow',' WOW ');
 select similarity('---', '####---');
-CREATE TABLE test_trgm(t text);
+CREATE TABLE test_trgm(t text COLLATE "C");
 \copy test_trgm from 'data/trgm.data'
@@ -40,7 +40,7 @@ select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu098
 select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
 select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
-create table test2(t text);
+create table test2(t text COLLATE "C");
 insert into test2 values ('abcdef');
 insert into test2 values ('quark');
 insert into test2 values ('  z foo bar');
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -26,13 +26,14 @@
 #define DIVUNION
 /* operator strategy numbers */
-#define SimilarityStrategyNumber	1
+#define SimilarityStrategyNumber		1
-#define DistanceStrategyNumber		2
+#define DistanceStrategyNumber			2
-#define LikeStrategyNumber			3
+#define LikeStrategyNumber				3
-#define ILikeStrategyNumber			4
+#define ILikeStrategyNumber				4
-#define RegExpStrategyNumber		5
+#define RegExpStrategyNumber			5
-#define RegExpICaseStrategyNumber	6
+#define RegExpICaseStrategyNumber		6
-
+#define WordSimilarityStrategyNumber	7
 #define WordDistanceStrategyNumber		8
 typedef char trgm[3];
@@ -103,15 +104,28 @@ typedef char *BITVECP;
 #define GETARR(x)		( (trgm*)( (char*)x+TRGMHDRSIZE ) )
 #define ARRNELEM(x) ( ( VARSIZE(x) - TRGMHDRSIZE )/sizeof(trgm) )
 /*
 * Similarity of two trigram sets, as a float4.
 *
 * count: number of trigrams the two sets have in common.
 * len1, len2: number of trigrams in each set.
 *
 * If DIVUNION is defined then similarity formula is:
 * count / (len1 + len2 - count)
 * else if DIVUNION is not defined then similarity formula is:
 * count / max(len1, len2)
 *
 * The whole expansion is parenthesized so that the macro can be used as an
 * operand of any expression (e.g. as a divisor) without the division
 * re-associating with the surrounding operators.  Arguments may be evaluated
 * more than once, so they must not have side effects.
 */
 #ifdef DIVUNION
 #define CALCSML(count, len1, len2) (((float4) (count)) / ((float4) ((len1) + (len2) - (count))))
 #else
 #define CALCSML(count, len1, len2) (((float4) (count)) / ((float4) (((len1) > (len2)) ? (len1) : (len2))))
 #endif
 typedef struct TrgmPackedGraph TrgmPackedGraph;
 extern double similarity_threshold;
 extern double word_similarity_threshold;
 extern uint32 trgm2int(trgm *ptr);
 extern void compact_trigram(trgm *tptr, char *str, int bytelen);
 extern TRGM *generate_trgm(char *str, int slen);
 extern TRGM *generate_wildcard_trgm(const char *str, int slen);
-extern float4 cnt_sml(TRGM *trg1, TRGM *trg2);
+extern float4 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact);
 extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
 extern bool *trgm_presence_map(TRGM *query, TRGM *key);
 extern TRGM *createTrgmNFA(text *text_re, Oid collation,
--- a/contrib/pg_trgm/trgm_gin.c
+++ b/contrib/pg_trgm/trgm_gin.c
@@ -89,6 +89,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
 	switch (strategy)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
 			trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
 			break;
 		case ILikeStrategyNumber:
@@ -176,6 +177,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
 	bool		res;
 	int32		i,
 				ntrue;
 	double		nlimit;
 	/* All cases served by this function are inexact */
 	*recheck = true;
@@ -183,6 +185,10 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
 	switch (strategy)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
 			nlimit = (strategy == SimilarityStrategyNumber) ?
 				similarity_threshold : word_similarity_threshold;
 			/* Count the matches */
 			ntrue = 0;
 			for (i = 0; i < nkeys; i++)
@@ -207,8 +213,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
 			 * So, independly on DIVUNION the upper bound formula is the same.
 			 */
 			res = (nkeys == 0) ? false :
-				((((((float4) ntrue) / ((float4) nkeys))) >= similarity_threshold)
+				(((((float4) ntrue) / ((float4) nkeys))) >= nlimit);
 					? true : false);
 			break;
 		case ILikeStrategyNumber:
 #ifndef IGNORECASE
@@ -270,10 +275,15 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
 	int32		i,
 				ntrue;
 	bool	   *boolcheck;
 	double		nlimit;
 	switch (strategy)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
 			nlimit = (strategy == SimilarityStrategyNumber) ?
 				similarity_threshold : word_similarity_threshold;
 			/* Count the matches */
 			ntrue = 0;
 			for (i = 0; i < nkeys; i++)
@@ -285,9 +295,9 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
 			/*
 			 * See comment in gin_trgm_consistent() about * upper bound formula
 			 */
-			res = (nkeys == 0) ? GIN_FALSE :
+			res = (nkeys == 0)
-				(((((float4) ntrue) / ((float4) nkeys)) >= similarity_threshold)
+				? GIN_FALSE : (((((float4) ntrue) / ((float4) nkeys)) >= nlimit)
-					? GIN_MAYBE : GIN_FALSE);
+							? GIN_MAYBE : GIN_FALSE);
 			break;
 		case ILikeStrategyNumber:
 #ifndef IGNORECASE
--- a/contrib/pg_trgm/trgm_gist.c
+++ b/contrib/pg_trgm/trgm_gist.c
@@ -191,6 +191,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 	bool		res;
 	Size		querysize = VARSIZE(query);
 	gtrgm_consistent_cache *cache;
 	double		nlimit;
 	/*
 	 * We keep the extracted trigrams in cache, because trigram extraction is
@@ -218,6 +219,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 		switch (strategy)
 		{
 			case SimilarityStrategyNumber:
 			case WordSimilarityStrategyNumber:
 				qtrg = generate_trgm(VARDATA(query),
 									 querysize - VARHDRSZ);
 				break;
@@ -286,16 +288,23 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 	switch (strategy)
 	{
 		case SimilarityStrategyNumber:
-			/* Similarity search is exact */
+		case WordSimilarityStrategyNumber:
-			*recheck = false;
+			/* Similarity search is exact. Word similarity search is inexact */
 			*recheck = (strategy == WordSimilarityStrategyNumber);
 			nlimit = (strategy == SimilarityStrategyNumber) ?
 				similarity_threshold : word_similarity_threshold;
 			if (GIST_LEAF(entry))
 			{					/* all leafs contains orig trgm */
-				float4		tmpsml = cnt_sml(key, qtrg);
+				/*
 				 * Prevent gcc optimizing the tmpsml variable using volatile
 				 * keyword. Otherwise comparison of nlimit and tmpsml may give
 				 * wrong results.
 				 */
 				float4 volatile tmpsml = cnt_sml(qtrg, key, *recheck);
 				/* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
-				res = (*(int *) &tmpsml == *(int *) &similarity_threshold
+				res = (*(int *) &tmpsml == *(int *) &nlimit || tmpsml > nlimit);
 						|| tmpsml > similarity_threshold) ? true : false;
 			}
 			else if (ISALLTRUE(key))
 			{					/* non-leaf contains signature */
@@ -309,8 +318,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 				if (len == 0)
 					res = false;
 				else
-					res = (((((float8) count) / ((float8) len))) >= similarity_threshold)
+					res = (((((float8) count) / ((float8) len))) >= nlimit);
 							? true : false;
 			}
 			break;
 		case ILikeStrategyNumber:
@@ -428,6 +436,7 @@ gtrgm_distance(PG_FUNCTION_ARGS)
 	StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
 	/* Oid		subtype = PG_GETARG_OID(3); */
 	bool	   *recheck = (bool *) PG_GETARG_POINTER(4);
 	TRGM	   *key = (TRGM *) DatumGetPointer(entry->key);
 	TRGM	   *qtrg;
 	float8		res;
@@ -463,9 +472,17 @@ gtrgm_distance(PG_FUNCTION_ARGS)
 	switch (strategy)
 	{
 		case DistanceStrategyNumber:
 		case WordDistanceStrategyNumber:
 			*recheck = strategy == WordDistanceStrategyNumber;
 			if (GIST_LEAF(entry))
 			{					/* all leafs contains orig trgm */
-				res = 1.0 - cnt_sml(key, qtrg);
+				/*
 				 * Prevent gcc optimizing the sml variable using volatile
 				 * keyword. Otherwise res can differ from the
 				 * word_similarity_dist_op() function.
 				 */
 				float4 volatile sml = cnt_sml(qtrg, key, *recheck);
 				res = 1.0 - sml;
 			}
 			else if (ISALLTRUE(key))
 			{					/* all leafs contains orig trgm */
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -15,7 +15,8 @@
 PG_MODULE_MAGIC;
 /* GUC variables */
-double		similarity_threshold = 0.3f;
+double similarity_threshold = 0.3f;
 double word_similarity_threshold = 0.6f;
 void		_PG_init(void);
@@ -23,8 +24,20 @@ PG_FUNCTION_INFO_V1(set_limit);
 PG_FUNCTION_INFO_V1(show_limit);
 PG_FUNCTION_INFO_V1(show_trgm);
 PG_FUNCTION_INFO_V1(similarity);
 PG_FUNCTION_INFO_V1(word_similarity);
 PG_FUNCTION_INFO_V1(similarity_dist);
 PG_FUNCTION_INFO_V1(similarity_op);
 PG_FUNCTION_INFO_V1(word_similarity_op);
 PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
 PG_FUNCTION_INFO_V1(word_similarity_dist_op);
 PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
 /*
 * Trigram with position.
 *
 * make_positional_trgm() fills "index" with the trigram's position in the
 * trigram array of the searched text, or with -1 for trigrams that come from
 * the search pattern.
 */
 typedef struct
 {
 	trgm	trg;		/* the trigram itself */
 	int		index;		/* position in the text's trigram array, or -1 */
 } pos_trgm;
 /*
 * Module load callback
@@ -45,11 +58,23 @@ _PG_init(void)
 							NULL,
 							NULL,
 							NULL);
 	DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
 							"Sets the threshold used by the <%% operator.",
 							"Valid range is 0.0 .. 1.0.",
 							&word_similarity_threshold,
 							0.6,
 							0.0,
 							1.0,
 							PGC_USERSET,
 							0,
 							NULL,
 							NULL,
 							NULL);
 }
 /*
 * Deprecated function.
- * Use "pg_trgm.similarity_threshold" GUC variable instead of this function
+ * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
 */
 Datum
 set_limit(PG_FUNCTION_ARGS)
@@ -59,14 +84,14 @@ set_limit(PG_FUNCTION_ARGS)
 	if (nlimit < 0 || nlimit > 1.0)
 		ereport(ERROR,
 				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-				 errmsg("wrong limit, should be between 0 and 1")));
+				 errmsg("wrong threshold, should be between 0 and 1")));
 	similarity_threshold = nlimit;
 	PG_RETURN_FLOAT4(similarity_threshold);
 }
 /*
 * Deprecated function.
- * Use "pg_trgm.similarity_threshold" GUC variable instead of this function
+ * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
 */
 Datum
 show_limit(PG_FUNCTION_ARGS)
@@ -199,38 +224,28 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
 	return tptr;
 }
-TRGM *
+/*
-generate_trgm(char *str, int slen)
+ * Make array of trigrams without sorting and removing duplicate items.
 *
 * trg: where to return the array of trigrams.
 * str: source string, of length slen bytes.
 *
 * Returns length of the generated array.
 */
 static int
 generate_trgm_only(trgm *trg, char *str, int slen)
 {
 	TRGM	   *trg;
 	char	   *buf;
 	trgm	   *tptr;
-	int			len,
+	char	   *buf;
-				charlen,
+	int			charlen,
 				bytelen;
 	char	   *bword,
 			   *eword;
 	/*
 	 * Guard against possible overflow in the palloc requests below.  (We
 	 * don't worry about the additive constants, since palloc can detect
 	 * requests that are a little above MaxAllocSize --- we just need to
 	 * prevent integer overflow in the multiplications.)
 	 */
 	if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
 		(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 				 errmsg("out of memory")));
 	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
 	trg->flag = ARRKEY;
 	SET_VARSIZE(trg, TRGMHDRSIZE);
 	if (slen + LPADDING + RPADDING < 3 || slen == 0)
-		return trg;
+		return 0;
-	tptr = GETARR(trg);
+	tptr = trg;
 	/* Allocate a buffer for case-folded, blank-padded words */
 	buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
@@ -270,7 +285,47 @@ generate_trgm(char *str, int slen)
 	pfree(buf);
-	if ((len = tptr - GETARR(trg)) == 0)
+	return tptr - trg;
 }
 /*
 * Guard against possible overflow in the palloc requests that callers make
 * for a string of slen bytes.  (We don't worry about the additive
 * constants, since palloc can detect requests that are a little above
 * MaxAllocSize --- we just need to prevent integer overflow in the
 * multiplications.)
 *
 * slen: length in bytes of the string trigrams will be extracted from.
 *
 * Reports ERRCODE_PROGRAM_LIMIT_EXCEEDED ("out of memory") when either
 * slen / 2 trigram triples or slen characters of the widest database
 * encoding would reach MaxAllocSize; otherwise returns normally.
 */
 static void
 protect_out_of_mem(int slen)
 {
 	if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
 		(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 				 errmsg("out of memory")));
 }
 /*
 * Make array of trigrams with sorting and removing duplicate items.
 *
 * str: source string, of length slen bytes.
 *
 * Returns the sorted array of unique trigrams.
 */
 TRGM *
 generate_trgm(char *str, int slen)
 {
 	TRGM	   *trg;
 	int			len;
 	protect_out_of_mem(slen);
 	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
 	trg->flag = ARRKEY;
 	len = generate_trgm_only(GETARR(trg), str, slen);
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
 	if (len == 0)
 		return trg;
 	/*
@@ -287,6 +342,285 @@ generate_trgm(char *str, int slen)
 	return trg;
 }
 /*
 * Build one array of positional trigrams out of two plain trigram arrays.
 *
 * trg1: trigrams of the search pattern, len1 entries.  Their positions are
 *       irrelevant, so each one is tagged with index -1.
 * trg2: trigrams of the text being searched, len2 entries.  Each one is
 *       tagged with its position within trg2.
 *
 * Returns a palloc'd array of len1 + len2 entries: the pattern trigrams
 * first, followed by the text trigrams.
 */
 static pos_trgm *
 make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
 {
 	int			total = len1 + len2;
 	pos_trgm   *merged = (pos_trgm *) palloc(sizeof(pos_trgm) * total);
 	pos_trgm   *dst = merged;
 	int			k;

 	/* Pattern trigrams carry no position */
 	for (k = 0; k < len1; k++, dst++)
 	{
 		memcpy(&dst->trg, &trg1[k], sizeof(trgm));
 		dst->index = -1;
 	}

 	/* Text trigrams remember where they occurred */
 	for (k = 0; k < len2; k++, dst++)
 	{
 		memcpy(&dst->trg, &trg2[k], sizeof(trgm));
 		dst->index = k;
 	}

 	return merged;
 }
 /*
 * qsort comparator for positional trigrams: order by trigram value first
 * and, for equal trigrams, by ascending index.
 */
 static int
 comp_ptrgm(const void *v1, const void *v2)
 {
 	const pos_trgm *lhs = (const pos_trgm *) v1;
 	const pos_trgm *rhs = (const pos_trgm *) v2;
 	int			trgcmp = CMPTRGM(lhs->trg, rhs->trg);

 	if (trgcmp != 0)
 		return trgcmp;

 	/* Equal trigrams: yields -1, 0 or 1 according to the index order */
 	return (lhs->index > rhs->index) - (lhs->index < rhs->index);
 }
 /*
 * Iterative search function which scans the trigrams of the text and looks
 * for the contiguous range of them that is most similar to the search
 * pattern.  The maximum similarity is guaranteed to be found only if
 * check_only == false; otherwise the search stops as soon as the similarity
 * reaches pg_trgm.word_similarity_threshold.
 *
 * trg2indexes: array which stores indexes of the array "found", one entry
 *              per trigram of the text, in text order.
 * found: array which stores true or false values; found[k] tells whether
 *        trigram number k is present in the search pattern.
 * ulen1: count of unique trigrams of array "trg1".
 * len2: length of array "trg2" and array "trg2indexes".
 * len: length of the array "found".
 * check_only: if true then only check existence of similar search pattern in
 *             text.
 *
 * Returns word similarity.  With check_only == true the returned value is
 * only meaningful for comparison against the threshold.
 */
 static float4
 iterate_word_similarity(int *trg2indexes,
 						bool *found,
 						int ulen1,
 						int len2,
 						int len,
 						bool check_only)
 {
 	int		   *lastpos,
 				i,
 				ulen2 = 0,
 				count = 0,
 				upper = -1,
 				lower = -1;
 	float4		smlr_cur,
 				smlr_max = 0.0f;
 	/*
 	 * Memorise last position of each trigram inside the current range.
 	 * Filling every byte with 0xFF makes each int read as -1 ("not seen").
 	 */
 	lastpos = (int *) palloc(sizeof(int) * len);
 	memset(lastpos, -1, sizeof(int) * len);
 	for (i = 0; i < len2; i++)
 	{
 		/* Get index of next trigram */
 		int	trgindex = trg2indexes[i];
 		/*
 		 * Update last position of this trigram.  Trigrams are tracked only
 		 * once a range has been opened (lower >= 0) or when this trigram
 		 * opens one.  ulen2 counts the unique trigrams in the range, count
 		 * those of them that are also present in the pattern.
 		 */
 		if (lower >= 0 || found[trgindex])
 		{
 			if (lastpos[trgindex] < 0)
 			{
 				ulen2++;
 				if (found[trgindex])
 					count++;
 			}
 			lastpos[trgindex] = i;
 		}
 		/* Adjust upper bound if this trigram is present in required substring */
 		if (found[trgindex])
 		{
 			int		prev_lower,
 					tmp_ulen2,
 					tmp_lower,
 					tmp_count;
 			upper = i;
 			if (lower == -1)
 			{
 				/* First matching trigram: the range starts here */
 				lower = i;
 				ulen2 = 1;
 			}
 			smlr_cur = CALCSML(count, ulen1, ulen2);
 			/*
 			 * Also try to adjust lower bound for greater similarity: walk
 			 * tmp_lower towards upper, keeping the counters of the shrunken
 			 * range in tmp_ulen2 and tmp_count.
 			 */
 			tmp_count = count;
 			tmp_ulen2 = ulen2;
 			prev_lower = lower;
 			for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
 			{
 				float	smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
 				int		tmp_trgindex;
 				/* Adopt the shrunken range when it is strictly better */
 				if (smlr_tmp > smlr_cur)
 				{
 					smlr_cur = smlr_tmp;
 					ulen2 = tmp_ulen2;
 					lower = tmp_lower;
 					count = tmp_count;
 				}
 				/*
 				 * if we only check that word similarity is greater than
 				 * pg_trgm.word_similarity_threshold we do not need to calculate
 				 * a maximum similarity.
 				 */
 				if (check_only && smlr_cur >= word_similarity_threshold)
 					break;
 				/*
 				 * Drop the trigram at tmp_lower from the candidate range.  It
 				 * stops being counted only if this was its last occurrence.
 				 */
 				tmp_trgindex = trg2indexes[tmp_lower];
 				if (lastpos[tmp_trgindex] == tmp_lower)
 				{
 					tmp_ulen2--;
 					if (found[tmp_trgindex])
 						tmp_count--;
 				}
 			}
 			smlr_max = Max(smlr_max, smlr_cur);
 			/*
 			 * if we only check that word similarity is greater than
 			 * pg_trgm.word_similarity_threshold we do not need to calculate a
 			 * maximum similarity
 			 */
 			if (check_only && smlr_max >= word_similarity_threshold)
 				break;
 			/*
 			 * Forget the trigrams that fell out of the range when the lower
 			 * bound moved from prev_lower to lower.
 			 */
 			for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
 			{
 				int		tmp_trgindex;
 				tmp_trgindex = trg2indexes[tmp_lower];
 				if (lastpos[tmp_trgindex] == tmp_lower)
 					lastpos[tmp_trgindex] = -1;
 			}
 		}
 	}
 	pfree(lastpos);
 	return smlr_max;
 }
 /*
 * Calculate word similarity.
 * This function prepares two arrays: "trg2indexes" and "found". Then these
 * arrays are used to calculate word similarity using
 * iterate_word_similarity().
 *
 * "trg2indexes" is array which stores indexes of the array "found".
 * In other words:
 * trg2indexes[j] = i;
 * found[i] = true (or false);
 * If found[i] == true then there is trigram trg2[j] in array "trg1".
 * If found[i] == false then there is not trigram trg2[j] in array "trg1".
 *
 * str1: search pattern string, of length slen1 bytes.
 * str2: text in which we are looking for a word, of length slen2 bytes.
 * check_only: if true then only check existence of similar search pattern in
 *             text.
 *
 * Returns word similarity.  If neither string produces any trigram the
 * result is 0.
 */
 static float4
 calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
 						  bool check_only)
 {
 	bool	   *found;
 	pos_trgm   *ptrg;
 	trgm	   *trg1;
 	trgm	   *trg2;
 	int			len1,
 				len2,
 				len,
 				i,
 				j,
 				ulen1;
 	int		   *trg2indexes;
 	float4		result;
 	protect_out_of_mem(slen1 + slen2);
 	/* Make positional trigrams */
 	trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
 	trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
 	len1 = generate_trgm_only(trg1, str1, slen1);
 	len2 = generate_trgm_only(trg2, str2, slen2);
 	ptrg = make_positional_trgm(trg1, len1, trg2, len2);
 	len = len1 + len2;
 	/*
 	 * Sort by trigram, then by position.  Pattern trigrams carry index -1
 	 * and therefore come first within each group of equal trigrams.
 	 */
 	qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
 	pfree(trg1);
 	pfree(trg2);
 	/*
 	 * Merge positional trigrams array: enumerate each trigram and find its
 	 * presence in required word.  "j" numbers the groups of equal trigrams;
 	 * "ulen1" counts the groups that contain a pattern trigram.
 	 */
 	trg2indexes = (int *) palloc(sizeof(int) * len2);
 	found = (bool *) palloc0(sizeof(bool) * len);
 	ulen1 = 0;
 	j = 0;
 	for (i = 0; i < len; i++)
 	{
 		if (i > 0)
 		{
 			int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
 			if (cmp != 0)
 			{
 				/* A new group starts: close the previous one */
 				if (found[j])
 					ulen1++;
 				j++;
 			}
 		}
 		if (ptrg[i].index >= 0)
 		{
 			/* Text trigram: remember which group its position belongs to */
 			trg2indexes[ptrg[i].index] = j;
 		}
 		else
 		{
 			/* Pattern trigram: mark the group as present in the pattern */
 			found[j] = true;
 		}
 	}
 	/*
 	 * Close the last group.  When neither string produced a trigram the
 	 * arrays have zero length, so found[0] must not be read.
 	 */
 	if (len > 0 && found[j])
 		ulen1++;
 	/* Run iterative procedure to find maximum similarity with word */
 	result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
 										  check_only);
 	pfree(trg2indexes);
 	pfree(found);
 	pfree(ptrg);
 	return result;
 }
 /*
 * Extract the next non-wildcard part of a search string, ie, a word bounded
 * by '_' or '%' meta-characters, non-word characters or string end.
@@ -459,17 +793,7 @@ generate_wildcard_trgm(const char *str, int slen)
 				bytelen;
 	const char *eword;
-	/*
+	protect_out_of_mem(slen);
 	 * Guard against possible overflow in the palloc requests below.  (We
 	 * don't worry about the additive constants, since palloc can detect
 	 * requests that are a little above MaxAllocSize --- we just need to
 	 * prevent integer overflow in the multiplications.)
 	 */
 	if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
 		(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 				 errmsg("out of memory")));
 	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
 	trg->flag = ARRKEY;
@@ -590,7 +914,7 @@ show_trgm(PG_FUNCTION_ARGS)
 }
 float4
-cnt_sml(TRGM *trg1, TRGM *trg2)
+cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
 {
 	trgm	   *ptr1,
 			   *ptr2;
@@ -624,14 +948,15 @@ cnt_sml(TRGM *trg1, TRGM *trg2)
 		}
 	}
-#ifdef DIVUNION
+	/*
-	return ((float4) count) / ((float4) (len1 + len2 - count));
+	 * If inexact then len2 is equal to count, because we don't know actual
-#else
+	 * length of second string in inexact search and we can assume that count
-	return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
+	 * is a lower bound of len2.
-#endif
+	 */
-
+	return CALCSML(count, len1, inexact ? count : len2);
 }
 /*
 * Returns whether trg2 contains all trigrams in trg1.
 * This relies on the trigram arrays being sorted.
@@ -726,7 +1051,7 @@ similarity(PG_FUNCTION_ARGS)
 	trg1 = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
 	trg2 = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
-	res = cnt_sml(trg1, trg2);
+	res = cnt_sml(trg1, trg2, false);
 	pfree(trg1);
 	pfree(trg2);
@@ -736,6 +1061,22 @@ similarity(PG_FUNCTION_ARGS)
 	PG_RETURN_FLOAT4(res);
 }
 /*
 * SQL-callable word_similarity(text, text).
 *
 * Argument 0 is the search pattern and argument 1 the text to search.
 * Returns the similarity computed by calc_word_similarity() with
 * check_only == false.
 */
 Datum
 word_similarity(PG_FUNCTION_ARGS)
 {
 	text	   *pattern = PG_GETARG_TEXT_PP(0);
 	text	   *haystack = PG_GETARG_TEXT_PP(1);
 	char	   *pattern_data = VARDATA_ANY(pattern);
 	int			pattern_len = VARSIZE_ANY_EXHDR(pattern);
 	char	   *haystack_data = VARDATA_ANY(haystack);
 	int			haystack_len = VARSIZE_ANY_EXHDR(haystack);
 	float4		sml;

 	sml = calc_word_similarity(pattern_data, pattern_len,
 							   haystack_data, haystack_len,
 							   false);

 	/* Release detoasted copies, if any were made */
 	PG_FREE_IF_COPY(pattern, 0);
 	PG_FREE_IF_COPY(haystack, 1);
 	PG_RETURN_FLOAT4(sml);
 }
 Datum
 similarity_dist(PG_FUNCTION_ARGS)
 {
@@ -755,3 +1096,67 @@ similarity_op(PG_FUNCTION_ARGS)
 	PG_RETURN_BOOL(res >= similarity_threshold);
 }
 /*
 * Boolean word similarity test: argument 0 is the search pattern and
 * argument 1 the text to search.
 *
 * Returns true when the similarity reported by calc_word_similarity() in
 * check_only mode is at least word_similarity_threshold.
 */
 Datum
 word_similarity_op(PG_FUNCTION_ARGS)
 {
 	text	   *pattern = PG_GETARG_TEXT_PP(0);
 	text	   *haystack = PG_GETARG_TEXT_PP(1);
 	char	   *pattern_data = VARDATA_ANY(pattern);
 	int			pattern_len = VARSIZE_ANY_EXHDR(pattern);
 	char	   *haystack_data = VARDATA_ANY(haystack);
 	int			haystack_len = VARSIZE_ANY_EXHDR(haystack);
 	float4		sml;

 	sml = calc_word_similarity(pattern_data, pattern_len,
 							   haystack_data, haystack_len,
 							   true);

 	/* Release detoasted copies, if any were made */
 	PG_FREE_IF_COPY(pattern, 0);
 	PG_FREE_IF_COPY(haystack, 1);
 	PG_RETURN_BOOL(sml >= word_similarity_threshold);
 }
 /*
 * Boolean word similarity test with the argument roles reversed: argument 0
 * is the text to search and argument 1 the search pattern.
 *
 * Returns true when the similarity reported by calc_word_similarity() in
 * check_only mode is at least word_similarity_threshold.
 */
 Datum
 word_similarity_commutator_op(PG_FUNCTION_ARGS)
 {
 	text	   *haystack = PG_GETARG_TEXT_PP(0);
 	text	   *pattern = PG_GETARG_TEXT_PP(1);
 	char	   *pattern_data = VARDATA_ANY(pattern);
 	int			pattern_len = VARSIZE_ANY_EXHDR(pattern);
 	char	   *haystack_data = VARDATA_ANY(haystack);
 	int			haystack_len = VARSIZE_ANY_EXHDR(haystack);
 	float4		sml;

 	sml = calc_word_similarity(pattern_data, pattern_len,
 							   haystack_data, haystack_len,
 							   true);

 	/* Release detoasted copies, if any were made */
 	PG_FREE_IF_COPY(haystack, 0);
 	PG_FREE_IF_COPY(pattern, 1);
 	PG_RETURN_BOOL(sml >= word_similarity_threshold);
 }
 /*
 * Word similarity distance: argument 0 is the search pattern and argument 1
 * the text to search.
 *
 * Returns one minus the similarity computed by calc_word_similarity() with
 * check_only == false.
 */
 Datum
 word_similarity_dist_op(PG_FUNCTION_ARGS)
 {
 	text	   *pattern = PG_GETARG_TEXT_PP(0);
 	text	   *haystack = PG_GETARG_TEXT_PP(1);
 	char	   *pattern_data = VARDATA_ANY(pattern);
 	int			pattern_len = VARSIZE_ANY_EXHDR(pattern);
 	char	   *haystack_data = VARDATA_ANY(haystack);
 	int			haystack_len = VARSIZE_ANY_EXHDR(haystack);
 	float4		sml;

 	sml = calc_word_similarity(pattern_data, pattern_len,
 							   haystack_data, haystack_len,
 							   false);

 	/* Release detoasted copies, if any were made */
 	PG_FREE_IF_COPY(pattern, 0);
 	PG_FREE_IF_COPY(haystack, 1);
 	PG_RETURN_FLOAT4(1.0 - sml);
 }
 /*
 * Word similarity distance with the argument roles reversed: argument 0 is
 * the text to search and argument 1 the search pattern.
 *
 * Returns one minus the similarity computed by calc_word_similarity() with
 * check_only == false.
 */
 Datum
 word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
 {
 	text	   *haystack = PG_GETARG_TEXT_PP(0);
 	text	   *pattern = PG_GETARG_TEXT_PP(1);
 	char	   *pattern_data = VARDATA_ANY(pattern);
 	int			pattern_len = VARSIZE_ANY_EXHDR(pattern);
 	char	   *haystack_data = VARDATA_ANY(haystack);
 	int			haystack_len = VARSIZE_ANY_EXHDR(haystack);
 	float4		sml;

 	sml = calc_word_similarity(pattern_data, pattern_len,
 							   haystack_data, haystack_len,
 							   false);

 	/* Release detoasted copies, if any were made */
 	PG_FREE_IF_COPY(haystack, 0);
 	PG_FREE_IF_COPY(pattern, 1);
 	PG_RETURN_FLOAT4(1.0 - sml);
 }
--- a/doc/src/sgml/pgtrgm.sgml
+++ b/doc/src/sgml/pgtrgm.sgml
@@ -92,6 +92,21 @@
       (In practice this is seldom useful except for debugging.)
      </entry>
     </row>
     <row>
      <entry>
       <function>word_similarity(text, text)</function>
       <indexterm><primary>word_similarity</primary></indexterm>
      </entry>
      <entry><type>real</type></entry>
      <entry>
       Returns a number that indicates how similar the first string is
       to the most similar word of the second string.  The function searches
       the second string for the most similar word, not the most similar
       substring.  The range of the result is zero (indicating that the two
       strings are completely dissimilar) to one (indicating that the first
       string is identical to one of the words of the second string).
      </entry>
     </row>
     <row>
      <entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
      <entry><type>real</type></entry>
@@ -137,6 +152,16 @@
       <varname>pg_trgm.similarity_threshold</>.
      </entry>
     </row>
     <row>
       <entry><type>text</> <literal>%&gt;</literal> <type>text</></entry>
       <entry><type>boolean</type></entry>
       <entry>
        Returns <literal>true</> if its first argument contains a word that is
        similar to the second argument, with a word similarity that is greater
        than or equal to the current word similarity threshold set by the
        <varname>pg_trgm.word_similarity_threshold</> parameter.
       </entry>
      </row>
     <row>
      <entry><type>text</> <literal>&lt;-&gt;</literal> <type>text</></entry>
      <entry><type>real</type></entry>
@@ -145,6 +170,16 @@
       one minus the <function>similarity()</> value.
      </entry>
     </row>
     <row>
       <entry>
        <type>text</> <literal>&lt;-&gt;&gt;</literal> <type>text</>
       </entry>
       <entry><type>real</type></entry>
       <entry>
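        Returns the <quote>distance</> between the arguments, that is
        one minus the <function>word_similarity()</> value computed with the
        second argument as the search pattern and the first argument as the
        text being searched.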
       </entry>
     </row>
    </tbody>
   </tgroup>
  </table>
@@ -168,6 +203,23 @@
     </para>
    </listitem>
   </varlistentry>
    <varlistentry id="guc-pgtrgm-word-similarity-threshold" xreflabel="pg_trgm.word_similarity_threshold">
     <term>
      <varname>pg_trgm.word_similarity_threshold</> (<type>real</type>)
      <indexterm>
       <primary>
        <varname>pg_trgm.word_similarity_threshold</> configuration parameter
       </primary>
      </indexterm>
     </term>
     <listitem>
      <para>
       Sets the current word similarity threshold that is used by
       the <literal>&lt;%</> and <literal>%&gt;</> operators.  The threshold
       must be between 0 and 1 (default is 0.6).
      </para>
     </listitem>
    </varlistentry>
  </variablelist>
 </sect2>
@@ -225,6 +277,33 @@ SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist
   a small number of the closest matches is wanted.
  </para>
  <para>
   Also you can use an index on the <structfield>t</> column for word
   similarity.  For example:
 <programlisting>
 SELECT t, word_similarity('<replaceable>word</>', t) AS sml
  FROM test_trgm
  WHERE t %&gt; '<replaceable>word</>'
  ORDER BY sml DESC, t;
 </programlisting>
   This will return all values in the text column that have a word
   which is sufficiently similar to <replaceable>word</>, sorted from best
   match to worst.  The index will be used to make this a fast operation
   even over very large data sets.
  </para>
  <para>
   A variant of the above query is
 <programlisting>
 SELECT t, t &lt;-&gt;&gt; '<replaceable>word</>' AS dist
  FROM test_trgm
  ORDER BY dist LIMIT 10;
 </programlisting>
   This can be implemented quite efficiently by GiST indexes, but not
   by GIN indexes.
  </para>
  <para>
   Beginning in <productname>PostgreSQL</> 9.1, these index types also support
   index searches for <literal>LIKE</> and <literal>ILIKE</>, for example