mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-29 22:49:41 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			298 lines
		
	
	
		
			5.9 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			298 lines
		
	
	
		
			5.9 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * lexize stream of lexemes
 | |
|  * Teodor Sigaev <teodor@sigaev.ru>
 | |
|  */
 | |
| #include "postgres.h"
 | |
| 
 | |
| #include <ctype.h>
 | |
| #include <locale.h>
 | |
| 
 | |
| #include "ts_cfg.h"
 | |
| #include "dict.h"
 | |
| 
 | |
| void
 | |
| LexizeInit(LexizeData * ld, TSCfgInfo * cfg)
 | |
| {
 | |
| 	ld->cfg = cfg;
 | |
| 	ld->curDictId = InvalidOid;
 | |
| 	ld->posDict = 0;
 | |
| 	ld->towork.head = ld->towork.tail = ld->curSub = NULL;
 | |
| 	ld->waste.head = ld->waste.tail = NULL;
 | |
| 	ld->lastRes = NULL;
 | |
| 	ld->tmpRes = NULL;
 | |
| }
 | |
| 
 | |
| static void
 | |
| LPLAddTail(ListParsedLex * list, ParsedLex * newpl)
 | |
| {
 | |
| 	if (list->tail)
 | |
| 	{
 | |
| 		list->tail->next = newpl;
 | |
| 		list->tail = newpl;
 | |
| 	}
 | |
| 	else
 | |
| 		list->head = list->tail = newpl;
 | |
| 	newpl->next = NULL;
 | |
| }
 | |
| 
 | |
| static ParsedLex *
 | |
| LPLRemoveHead(ListParsedLex * list)
 | |
| {
 | |
| 	ParsedLex  *res = list->head;
 | |
| 
 | |
| 	if (list->head)
 | |
| 		list->head = list->head->next;
 | |
| 
 | |
| 	if (list->head == NULL)
 | |
| 		list->tail = NULL;
 | |
| 
 | |
| 	return res;
 | |
| }
 | |
| 
 | |
| 
 | |
| void
 | |
| LexizeAddLemm(LexizeData * ld, int type, char *lemm, int lenlemm)
 | |
| {
 | |
| 	ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
 | |
| 
 | |
| 	newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
 | |
| 	newpl->type = type;
 | |
| 	newpl->lemm = lemm;
 | |
| 	newpl->lenlemm = lenlemm;
 | |
| 	LPLAddTail(&ld->towork, newpl);
 | |
| 	ld->curSub = ld->towork.tail;
 | |
| }
 | |
| 
 | |
| static void
 | |
| RemoveHead(LexizeData * ld)
 | |
| {
 | |
| 	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
 | |
| 
 | |
| 	ld->posDict = 0;
 | |
| }
 | |
| 
 | |
| static void
 | |
| setCorrLex(LexizeData * ld, ParsedLex ** correspondLexem)
 | |
| {
 | |
| 	if (correspondLexem)
 | |
| 	{
 | |
| 		*correspondLexem = ld->waste.head;
 | |
| 	}
 | |
| 	else
 | |
| 	{
 | |
| 		ParsedLex  *tmp,
 | |
| 				   *ptr = ld->waste.head;
 | |
| 
 | |
| 		while (ptr)
 | |
| 		{
 | |
| 			tmp = ptr->next;
 | |
| 			pfree(ptr);
 | |
| 			ptr = tmp;
 | |
| 		}
 | |
| 	}
 | |
| 	ld->waste.head = ld->waste.tail = NULL;
 | |
| }
 | |
| 
 | |
| static void
 | |
| moveToWaste(LexizeData * ld, ParsedLex * stop)
 | |
| {
 | |
| 	bool		go = true;
 | |
| 
 | |
| 	while (ld->towork.head && go)
 | |
| 	{
 | |
| 		if (ld->towork.head == stop)
 | |
| 		{
 | |
| 			ld->curSub = stop->next;
 | |
| 			go = false;
 | |
| 		}
 | |
| 		RemoveHead(ld);
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static void
 | |
| setNewTmpRes(LexizeData * ld, ParsedLex * lex, TSLexeme * res)
 | |
| {
 | |
| 	if (ld->tmpRes)
 | |
| 	{
 | |
| 		TSLexeme   *ptr;
 | |
| 
 | |
| 		for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
 | |
| 			pfree(ptr->lexeme);
 | |
| 		pfree(ld->tmpRes);
 | |
| 	}
 | |
| 	ld->tmpRes = res;
 | |
| 	ld->lastRes = lex;
 | |
| }
 | |
| 
 | |
| TSLexeme *
 | |
| LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem)
 | |
| {
 | |
| 	int			i;
 | |
| 	ListDictionary *map;
 | |
| 	DictInfo   *dict;
 | |
| 	TSLexeme   *res;
 | |
| 
 | |
| 	if (ld->curDictId == InvalidOid)
 | |
| 	{
 | |
| 		/*
 | |
| 		 * usial mode: dictionary wants only one word, but we should keep in
 | |
| 		 * mind that we should go through all stack
 | |
| 		 */
 | |
| 
 | |
| 		while (ld->towork.head)
 | |
| 		{
 | |
| 			ParsedLex  *curVal = ld->towork.head;
 | |
| 
 | |
| 			map = ld->cfg->map + curVal->type;
 | |
| 
 | |
| 			if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0)
 | |
| 			{
 | |
| 				/* skip this type of lexeme */
 | |
| 				RemoveHead(ld);
 | |
| 				continue;
 | |
| 			}
 | |
| 
 | |
| 			for (i = ld->posDict; i < map->len; i++)
 | |
| 			{
 | |
| 				dict = finddict(DatumGetObjectId(map->dict_id[i]));
 | |
| 
 | |
| 				ld->dictState.isend = ld->dictState.getnext = false;
 | |
| 				ld->dictState.private = NULL;
 | |
| 				res = (TSLexeme *) DatumGetPointer(FunctionCall4(
 | |
| 														&(dict->lexize_info),
 | |
| 										   PointerGetDatum(dict->dictionary),
 | |
| 											   PointerGetDatum(curVal->lemm),
 | |
| 											  Int32GetDatum(curVal->lenlemm),
 | |
| 											  PointerGetDatum(&ld->dictState)
 | |
| 																 ));
 | |
| 
 | |
| 				if (ld->dictState.getnext)
 | |
| 				{
 | |
| 					/*
 | |
| 					 * dictinary wants next word, so setup and store current
 | |
| 					 * position and go to multiword  mode
 | |
| 					 */
 | |
| 
 | |
| 					ld->curDictId = DatumGetObjectId(map->dict_id[i]);
 | |
| 					ld->posDict = i + 1;
 | |
| 					ld->curSub = curVal->next;
 | |
| 					if (res)
 | |
| 						setNewTmpRes(ld, curVal, res);
 | |
| 					return LexizeExec(ld, correspondLexem);
 | |
| 				}
 | |
| 
 | |
| 				if (!res)		/* dictionary doesn't know this lexeme */
 | |
| 					continue;
 | |
| 
 | |
| 				RemoveHead(ld);
 | |
| 				setCorrLex(ld, correspondLexem);
 | |
| 				return res;
 | |
| 			}
 | |
| 
 | |
| 			RemoveHead(ld);
 | |
| 		}
 | |
| 	}
 | |
| 	else
 | |
| 	{							/* curDictId is valid */
 | |
| 		dict = finddict(ld->curDictId);
 | |
| 
 | |
| 		/*
 | |
| 		 * Dictionary ld->curDictId asks  us about following words
 | |
| 		 */
 | |
| 
 | |
| 		while (ld->curSub)
 | |
| 		{
 | |
| 			ParsedLex  *curVal = ld->curSub;
 | |
| 
 | |
| 			map = ld->cfg->map + curVal->type;
 | |
| 
 | |
| 			if (curVal->type != 0)
 | |
| 			{
 | |
| 				bool		dictExists = false;
 | |
| 
 | |
| 				if (curVal->type >= ld->cfg->len || map->len == 0)
 | |
| 				{
 | |
| 					/* skip this type of lexeme */
 | |
| 					ld->curSub = curVal->next;
 | |
| 					continue;
 | |
| 				}
 | |
| 
 | |
| 				/*
 | |
| 				 * We should be sure that current type of lexeme is recognized
 | |
| 				 * by our dictinonary: we just check is it exist in list of
 | |
| 				 * dictionaries ?
 | |
| 				 */
 | |
| 				for (i = 0; i < map->len && !dictExists; i++)
 | |
| 					if (ld->curDictId == DatumGetObjectId(map->dict_id[i]))
 | |
| 						dictExists = true;
 | |
| 
 | |
| 				if (!dictExists)
 | |
| 				{
 | |
| 					/*
 | |
| 					 * Dictionary can't work with current tpe of lexeme,
 | |
| 					 * return to basic mode and redo all stored lexemes
 | |
| 					 */
 | |
| 					ld->curDictId = InvalidOid;
 | |
| 					return LexizeExec(ld, correspondLexem);
 | |
| 				}
 | |
| 			}
 | |
| 
 | |
| 			ld->dictState.isend = (curVal->type == 0) ? true : false;
 | |
| 			ld->dictState.getnext = false;
 | |
| 
 | |
| 			res = (TSLexeme *) DatumGetPointer(FunctionCall4(
 | |
| 														&(dict->lexize_info),
 | |
| 										   PointerGetDatum(dict->dictionary),
 | |
| 											   PointerGetDatum(curVal->lemm),
 | |
| 											  Int32GetDatum(curVal->lenlemm),
 | |
| 											  PointerGetDatum(&ld->dictState)
 | |
| 															 ));
 | |
| 
 | |
| 			if (ld->dictState.getnext)
 | |
| 			{
 | |
| 				/* Dictionary wants one more */
 | |
| 				ld->curSub = curVal->next;
 | |
| 				if (res)
 | |
| 					setNewTmpRes(ld, curVal, res);
 | |
| 				continue;
 | |
| 			}
 | |
| 
 | |
| 			if (res || ld->tmpRes)
 | |
| 			{
 | |
| 				/*
 | |
| 				 * Dictionary normalizes lexemes, so we remove from stack all
 | |
| 				 * used lexemes , return to basic mode and redo end of stack
 | |
| 				 * (if it exists)
 | |
| 				 */
 | |
| 				if (res)
 | |
| 				{
 | |
| 					moveToWaste(ld, ld->curSub);
 | |
| 				}
 | |
| 				else
 | |
| 				{
 | |
| 					res = ld->tmpRes;
 | |
| 					moveToWaste(ld, ld->lastRes);
 | |
| 				}
 | |
| 
 | |
| 				/* reset to initial state */
 | |
| 				ld->curDictId = InvalidOid;
 | |
| 				ld->posDict = 0;
 | |
| 				ld->lastRes = NULL;
 | |
| 				ld->tmpRes = NULL;
 | |
| 				setCorrLex(ld, correspondLexem);
 | |
| 				return res;
 | |
| 			}
 | |
| 
 | |
| 			/*
 | |
| 			 * Dict don't want next lexem and didn't recognize anything, redo
 | |
| 			 * from ld->towork.head
 | |
| 			 */
 | |
| 			ld->curDictId = InvalidOid;
 | |
| 			return LexizeExec(ld, correspondLexem);
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	setCorrLex(ld, correspondLexem);
 | |
| 	return NULL;
 | |
| }
 |