1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-31 22:04:40 +03:00

Add word_similarity to pg_trgm contrib module.

Patch introduces a concept of similarity over string and just a word from
another string.

Version of extension is not changed because 1.2 was already introduced in 9.6
release cycle, so, there wasn't a public version.

Author: Alexander Korotkov, Artur Zakirov
This commit is contained in:
Teodor Sigaev
2016-03-16 18:59:21 +03:00
parent 1c4f001b79
commit f576b17cd6
10 changed files with 726 additions and 75 deletions

View File

@ -26,13 +26,14 @@
#define DIVUNION
/* operator strategy numbers */
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
#define LikeStrategyNumber 3
#define ILikeStrategyNumber 4
#define RegExpStrategyNumber 5
#define RegExpICaseStrategyNumber 6
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
#define LikeStrategyNumber 3
#define ILikeStrategyNumber 4
#define RegExpStrategyNumber 5
#define RegExpICaseStrategyNumber 6
#define WordSimilarityStrategyNumber 7
#define WordDistanceStrategyNumber 8
typedef char trgm[3];
@ -103,15 +104,28 @@ typedef char *BITVECP;
#define GETARR(x) ( (trgm*)( (char*)x+TRGMHDRSIZE ) )
#define ARRNELEM(x) ( ( VARSIZE(x) - TRGMHDRSIZE )/sizeof(trgm) )
/*
* If DIVUNION is defined then similarity formula is:
* count / (len1 + len2 - count)
* else if DIVUNION is not defined then similarity formula is:
* count / max(len1, len2)
*/
#ifdef DIVUNION
#define CALCSML(count, len1, len2) ((float4) (count)) / ((float4) ((len1) + (len2) - (count)))
#else
#define CALCSML(count, len1, len2) ((float4) (count)) / ((float4) (((len1) > (len2)) ? (len1) : (len2)))
#endif
typedef struct TrgmPackedGraph TrgmPackedGraph;
extern double similarity_threshold;
extern double word_similarity_threshold;
extern uint32 trgm2int(trgm *ptr);
extern void compact_trigram(trgm *tptr, char *str, int bytelen);
extern TRGM *generate_trgm(char *str, int slen);
extern TRGM *generate_wildcard_trgm(const char *str, int slen);
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2);
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact);
extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
extern bool *trgm_presence_map(TRGM *query, TRGM *key);
extern TRGM *createTrgmNFA(text *text_re, Oid collation,