mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-25 13:17:41 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			210 lines
		
	
	
		
			4.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			210 lines
		
	
	
		
			4.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * morphology module
 | |
|  * New dictionary is include in dict.h. For languages which
 | |
|  * use latin charset it may be need to modify mapdict table.
 | |
|  * Teodor Sigaev <teodor@stack.net>
 | |
|  */
 | |
| #include "postgres.h"
 | |
| 
 | |
| #include <locale.h>
 | |
| 
 | |
| #include "utils/builtins.h"
 | |
| 
 | |
| #include "morph.h"
 | |
| #include "deflex.h"
 | |
| 
 | |
| /*
 | |
|  * Struct for calling dictionaries
 | |
|  * All of this methods are optional, but
 | |
|  * if all methods are NULL, then dictionary does nothing :)
 | |
|  * Return value of lemmatize must be palloced or the same.
 | |
|  * Return value of init must be malloced in other case
 | |
|  * it will be free in end of transaction!
 | |
|  */
 | |
| typedef struct
 | |
| {
 | |
| 	char		localename[NAMEDATALEN];
 | |
| 	/* init dictionary */
 | |
| 	void	   *(*init) (void);
 | |
| 	/* close dictionary */
 | |
| 	void		(*close) (void *);
 | |
| 	/* find in dictionary */
 | |
| 	char	   *(*lemmatize) (void *, char *, int *);
 | |
| 	int			(*is_stoplemm) (void *, char *, int);
 | |
| 	int			(*is_stemstoplemm) (void *, char *, int);
 | |
| }	DICT;
 | |
| 
 | |
| /* insert all dictionaries */
 | |
| #define DICT_BODY
 | |
| #include "dict.h"
 | |
| #undef	DICT_BODY
 | |
| 
 | |
| /* fill dictionary's structure */
 | |
| #define DICT_TABLE
 | |
| DICT		dicts[] = {
 | |
| 	{
 | |
| 		"C", NULL, NULL, NULL, NULL, NULL		/* fake dictionary */
 | |
| 	}
 | |
| #include "dict.h"
 | |
| };
 | |
| 
 | |
| #undef DICT_TABLE
 | |
| 
 | |
| /* array for storing dictionary's objects (if needed) */
 | |
| void	   *dictobjs[lengthof(dicts)];
 | |
| 
 | |
| #define STOPLEXEM	-2
 | |
| #define BYLOCALE	-1
 | |
| #define NODICT		0
 | |
| #define DEFAULTDICT 1
 | |
| 
 | |
| #define MAXNDICT	2
 | |
| typedef int2 MAPDICT[MAXNDICT];
 | |
| 
 | |
| #define GETDICT(x,i)	*( ((int2*)(x)) + (i) )
 | |
| 
 | |
| /* map dictionaries for lexem type */
 | |
| static MAPDICT mapdict[] = {
 | |
| 	{NODICT, NODICT},			/* not used			*/
 | |
| 	{DEFAULTDICT, NODICT},		/* LATWORD		*/
 | |
| 	{BYLOCALE, NODICT},			/* NONLATINWORD		*/
 | |
| 	{BYLOCALE, DEFAULTDICT},	/* UWORD		*/
 | |
| 	{NODICT, NODICT},			/* EMAIL		*/
 | |
| 	{NODICT, NODICT},			/* FURL			*/
 | |
| 	{NODICT, NODICT},			/* HOST			*/
 | |
| 	{NODICT, NODICT},			/* SCIENTIFIC		*/
 | |
| 	{NODICT, NODICT},			/* VERSIONNUMBER		*/
 | |
| 	{BYLOCALE, DEFAULTDICT},	/* PARTHYPHENWORD		*/
 | |
| 	{BYLOCALE, NODICT},			/* CYRPARTHYPHENWORD */
 | |
| 	{DEFAULTDICT, NODICT},		/* LATPARTHYPHENWORD		*/
 | |
| 	{STOPLEXEM, NODICT},		/* SPACE		*/
 | |
| 	{STOPLEXEM, NODICT},		/* TAG		*/
 | |
| 	{STOPLEXEM, NODICT},		/* HTTP			*/
 | |
| 	{BYLOCALE, DEFAULTDICT},	/* HYPHENWORD		*/
 | |
| 	{DEFAULTDICT, NODICT},		/* LATHYPHENWORD		*/
 | |
| 	{BYLOCALE, NODICT},			/* CYRHYPHENWORD	*/
 | |
| 	{NODICT, NODICT},			/* URI			*/
 | |
| 	{NODICT, NODICT},			/* FILEPATH		*/
 | |
| 	{NODICT, NODICT},			/* DECIMAL		*/
 | |
| 	{NODICT, NODICT},			/* SIGNEDINT		*/
 | |
| 	{NODICT, NODICT},			/* UNSIGNEDINT		*/
 | |
| 	{STOPLEXEM, NODICT}			/* HTMLENTITY		*/
 | |
| };
 | |
| 
 | |
| static bool inited = false;
 | |
| 
 | |
| void
 | |
| initmorph(void)
 | |
| {
 | |
| 	int			i,
 | |
| 				j,
 | |
| 				k;
 | |
| 	MAPDICT    *md;
 | |
| 	bool		needinit[lengthof(dicts)];
 | |
| 	const char *curlocale;
 | |
| 	int			bylocaledict = NODICT;
 | |
| 
 | |
| 	if (inited)
 | |
| 		return;
 | |
| 	for (i = 1; i < lengthof(dicts); i++)
 | |
| 		needinit[i] = false;
 | |
| 
 | |
| 	curlocale = setlocale(LC_CTYPE, NULL);
 | |
| 	if (curlocale)
 | |
| 	{
 | |
| 		for (i = 1; i < lengthof(dicts); i++)
 | |
| 			if (strcmp(dicts[i].localename, curlocale) == 0)
 | |
| 			{
 | |
| 				bylocaledict = i;
 | |
| 				break;
 | |
| 			}
 | |
| 	}
 | |
| 
 | |
| 	for (i = 1; i < lengthof(mapdict); i++)
 | |
| 	{
 | |
| 		k = 0;
 | |
| 		md = &mapdict[i];
 | |
| 		for (j = 0; j < MAXNDICT; j++)
 | |
| 		{
 | |
| 			GETDICT(md, k) = GETDICT(md, j);
 | |
| 			if (GETDICT(md, k) == NODICT)
 | |
| 				break;
 | |
| 			else if (GETDICT(md, k) == BYLOCALE)
 | |
| 			{
 | |
| 				if (bylocaledict == NODICT)
 | |
| 					continue;
 | |
| 				GETDICT(md, k) = bylocaledict;
 | |
| 			}
 | |
| 			if (GETDICT(md, k) >= (int2) lengthof(dicts))
 | |
| 				continue;
 | |
| 			needinit[GETDICT(md, k)] = true;
 | |
| 			k++;
 | |
| 		}
 | |
| 		for (; k < MAXNDICT; k++)
 | |
| 			if (GETDICT(md, k) != STOPLEXEM)
 | |
| 				GETDICT(md, k) = NODICT;
 | |
| 	}
 | |
| 
 | |
| 	for (i = 1; i < lengthof(dicts); i++)
 | |
| 		if (needinit[i] && dicts[i].init)
 | |
| 			dictobjs[i] = (*(dicts[i].init)) ();
 | |
| 
 | |
| 	inited = true;
 | |
| 	return;
 | |
| }
 | |
| 
 | |
| char *
 | |
| lemmatize(char *word, int *len, int type)
 | |
| {
 | |
| 	int2		nd;
 | |
| 	int			i;
 | |
| 	DICT	   *dict;
 | |
| 
 | |
| 	for (i = 0; i < MAXNDICT; i++)
 | |
| 	{
 | |
| 		nd = GETDICT(&mapdict[type], i);
 | |
| 		if (nd == NODICT)
 | |
| 		{
 | |
| 			/* there is no dictionary */
 | |
| 			return word;
 | |
| 		}
 | |
| 		else if (nd == STOPLEXEM)
 | |
| 		{
 | |
| 			/* word is stopword */
 | |
| 			return NULL;
 | |
| 		}
 | |
| 		else
 | |
| 		{
 | |
| 			dict = &dicts[nd];
 | |
| 			if (dict->is_stoplemm && (*(dict->is_stoplemm)) (dictobjs[nd], word, *len))
 | |
| 				return NULL;
 | |
| 			if (dict->lemmatize)
 | |
| 			{
 | |
| 				int			oldlen = *len;
 | |
| 				char	   *newword = (*(dict->lemmatize)) (dictobjs[nd], word, len);
 | |
| 
 | |
| 				/* word is recognized by distionary */
 | |
| 				if (newword != word || *len != oldlen)
 | |
| 				{
 | |
| 					if (dict->is_stemstoplemm &&
 | |
| 					(*(dict->is_stemstoplemm)) (dictobjs[nd], word, *len))
 | |
| 					{
 | |
| 						if (newword != word && newword)
 | |
| 							pfree(newword);
 | |
| 						return NULL;
 | |
| 					}
 | |
| 					return newword;
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return word;
 | |
| }
 | |
| 
 | |
| bool
 | |
| is_stoptype(int type)
 | |
| {
 | |
| 	return (GETDICT(&mapdict[type], 0) == STOPLEXEM) ? true : false;
 | |
| }
 |