txtidx datatype for full text indexing with GiST.

From Oleg Bartunov and Teodor Sigaev.
2026-01-05 23:38:41 +03:00 · 2001-10-12 23:19:09 +00:00
parent c24216bea8
commit b57705673d
27 changed files with 24874 additions and 5 deletions
--- a/contrib/tsearch/morph.c
+++ b/contrib/tsearch/morph.c
@@ -0,0 +1,188 @@
+/*
+ * morphology module
+ * New dictionary is include in dict.h. For languages which
+ * use latin charset it may be need to modify mapdict table.
+ * Teodor Sigaev <teodor@stack.net>
+ */ 
+#include "postgres.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/builtins.h"
+#include "catalog/pg_control.h"
+#include "utils/pg_locale.h"
+
+#include "morph.h"
+#include "deflex.h"
+
+/*
+ * Struct for calling dictionaries
+ * All of this methods are optional, but
+ * if all methods are NULL, then dictionary does nothing :)
+ * Return value of lemmatize must be palloced or the same.
+ * Return value of init must be malloced in other case 
+ * it will be free in end of transaction! 
+ */
+typedef struct {
+	char    localename[LOCALE_NAME_BUFLEN];
+	/* init dictionary */
+	void*	(*init)(void);
+	/* close dictionary */
+	void	(*close)(void*);
+	/* find in dictionary */
+	char*	(*lemmatize)(void*,char*,int*);
+	int	(*is_stoplemm)(void*,char*,int);
+	int	(*is_stemstoplemm)(void*,char*,int);
+} DICT;
+
+/* insert all dictionaries */
+#define DICT_BODY
+#include "dict.h"
+#undef  DICT_BODY
+
+/* fill dictionary's structure */  
+#define DICT_TABLE
+DICT dicts[] = {
+	{
+		"C",NULL,NULL,NULL,NULL,NULL	/* fake dictionary */
+	}
+#include "dict.h"
+};
+#undef DICT_TABLE
+
+/* array for storing dictinary's objects (if needed) */
+void*	dictobjs[ lengthof(dicts) ];
+
+#define STOPLEXEM	-2
+#define BYLOCALE	-1
+#define	NODICT		0
+#define DEFAULTDICT	1 
+	
+#define MAXNDICT	2
+typedef int2	MAPDICT[MAXNDICT];
+#define GETDICT(x,i)	*( ((int2*)(x)) + (i) )
+
+/* map dictionaries for lexem type */
+static MAPDICT mapdict[] = {
+	{NODICT,	NODICT},	/* not used 		*/
+	{DEFAULTDICT,	NODICT},	/* LATWORD		*/
+	{BYLOCALE,	NODICT},	/* NONLATINWORD		*/
+	{BYLOCALE,	DEFAULTDICT},	/* UWORD		*/
+	{NODICT,	NODICT},	/* EMAIL		*/
+	{NODICT,	NODICT},	/* FURL			*/
+	{NODICT,	NODICT},	/* HOST			*/
+	{NODICT,	NODICT},	/* FLOAT		*/
+	{NODICT,	NODICT},	/* FINT			*/
+	{BYLOCALE,	DEFAULTDICT},	/* PARTWORD		*/
+	{BYLOCALE,	NODICT},	/* NONLATINPARTWORD	*/
+	{DEFAULTDICT,	NODICT},	/* LATPARTWORD		*/
+	{STOPLEXEM,	NODICT},	/* SPACE		*/
+	{STOPLEXEM,	NODICT},	/* SYMTAG		*/
+	{STOPLEXEM,	NODICT},	/* HTTP			*/
+	{BYLOCALE,	DEFAULTDICT},	/* DEFISWORD		*/
+	{DEFAULTDICT,	NODICT},	/* DEFISLATWORD		*/
+	{BYLOCALE,	NODICT},	/* DEFISNONLATINWORD	*/
+	{NODICT,	NODICT},	/* URI			*/
+	{NODICT,	NODICT}		/* FILEPATH		*/
+};
+
+static bool inited=false;
+
+void initmorph(void) {
+	int i,j,k;
+	MAPDICT	*md;
+	bool 	needinit[ lengthof(dicts) ];
+#ifdef USE_LOCALE
+	PG_LocaleCategories     lc;
+
+	int	bylocaledict = NODICT;
+#endif
+
+	if ( inited ) return;
+	for(i=1; i<lengthof(dicts);i++)
+		needinit[i] = false;
+	
+#ifdef USE_LOCALE
+	PGLC_current(&lc);
+	for(i=1;i<lengthof(dicts);i++)
+		if (strcmp( dicts[i].localename, lc.lang ) == 0) {
+			bylocaledict = i;
+			break;
+		}
+	PGLC_free_categories(&lc);
+#endif
+
+	for(i=1; i<lengthof(mapdict);i++) {
+		k=0;
+		md = &mapdict[i];
+		for(j=0;j<MAXNDICT;j++) {
+			GETDICT(md,k) = GETDICT(md,j);
+			if ( GETDICT(md,k) == NODICT ) {
+				break;
+			} else if ( GETDICT(md,k) == BYLOCALE ) {
+#ifdef USE_LOCALE
+				if ( bylocaledict == NODICT )
+					continue;
+				GETDICT(md,k) = bylocaledict;
+#else
+				continue;
+#endif
+			}
+			if ( GETDICT(md,k) >= (int2)lengthof(dicts) )
+				continue;
+			needinit[ GETDICT(md,k) ] = true;
+			k++; 
+		}
+		for(;k<MAXNDICT;k++)
+			if ( GETDICT(md,k) != STOPLEXEM )
+				GETDICT(md,k) = NODICT;
+	}
+
+	for(i=1; i<lengthof(dicts);i++)
+		if ( needinit[i] && dicts[i].init )
+			dictobjs[i] = (*(dicts[i].init))();
+			 
+	inited = true;
+	return;
+}
+
+char* lemmatize( char* word, int *len, int type ) {
+	int2	nd;
+	int i;
+	DICT	*dict;
+
+	for(i=0;i<MAXNDICT;i++) {
+		nd = GETDICT( &mapdict[type], i );
+		if ( nd == NODICT ) {
+			/* there is no dictionary */ 
+			return word;
+		} else if ( nd == STOPLEXEM ) {
+			/* word is stopword */
+			return NULL;
+		} else {
+			dict = &dicts[ nd ];
+			if ( dict->is_stoplemm && (*(dict->is_stoplemm))(dictobjs[nd], word, *len) )
+				return NULL;
+			if ( dict->lemmatize ) {
+				int oldlen = *len;
+				char *newword = (*(dict->lemmatize))(dictobjs[nd], word, len);
+				/* word is recognized by distionary */
+				if ( newword != word || *len != oldlen ) {
+					if ( dict->is_stemstoplemm && 
+							(*(dict->is_stemstoplemm))(dictobjs[nd], word, *len) ) {
+						if ( newword != word && newword)
+							pfree(newword);
+						return NULL;
+					}
+					return newword;
+				}
+			}
+		}
+	}
+
+	return word;
+}
+
+bool is_stoptype(int type) {
+	return ( GETDICT( &mapdict[type], 0 ) == STOPLEXEM ) ? true : false;
+}