mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-31 10:30:33 +03:00 
			
		
		
		
	This required some changes to the lexize algorithm, but the interface with dictionaries stays compatible with old dictionaries. Funded by Georgia Public Library Service and LibLime, Inc.
		
			
				
	
	
		
			262 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			262 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * lexize stream of lexemes 
 | |
|  * Teodor Sigaev <teodor@sigaev.ru>
 | |
|  */
 | |
| #include "postgres.h"
 | |
| 
 | |
| #include <ctype.h>
 | |
| #include <locale.h>
 | |
| 
 | |
| #include "ts_cfg.h"
 | |
| #include "dict.h"
 | |
| 
 | |
| void
 | |
| LexizeInit(LexizeData *ld, TSCfgInfo *cfg) {
 | |
| 	ld->cfg = cfg;
 | |
| 	ld->curDictId = InvalidOid;
 | |
| 	ld->posDict = 0;
 | |
| 	ld->towork.head = ld->towork.tail = ld->curSub = NULL;
 | |
| 	ld->waste.head = ld->waste.tail = NULL;
 | |
| 	ld->lastRes=NULL;
 | |
| 	ld->tmpRes=NULL;
 | |
| }
 | |
| 
 | |
| static void
 | |
| LPLAddTail(ListParsedLex *list, ParsedLex *newpl) {
 | |
| 	if ( list->tail ) {
 | |
| 		list->tail->next = newpl;
 | |
| 		list->tail = newpl;
 | |
| 	} else
 | |
| 		list->head = list->tail = newpl;
 | |
| 	newpl->next = NULL;
 | |
| }
 | |
| 
 | |
| static ParsedLex*
 | |
| LPLRemoveHead(ListParsedLex *list) {
 | |
| 	ParsedLex *res = list->head;
 | |
| 
 | |
| 	if ( list->head ) 
 | |
| 		list->head = list->head->next;
 | |
| 
 | |
| 	if ( list->head == NULL )
 | |
| 		list->tail = NULL;
 | |
| 
 | |
| 	return res;
 | |
| }
 | |
| 
 | |
| 
 | |
| void
 | |
| LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) {
 | |
| 	ParsedLex *newpl = (ParsedLex*)palloc( sizeof(ParsedLex) );
 | |
| 
 | |
| 	newpl = (ParsedLex*)palloc( sizeof(ParsedLex) );
 | |
| 	newpl->type = type;
 | |
| 	newpl->lemm = lemm;
 | |
| 	newpl->lenlemm = lenlemm;
 | |
| 	LPLAddTail(&ld->towork, newpl);
 | |
| 	ld->curSub = ld->towork.tail;
 | |
| }
 | |
| 
 | |
| static void
 | |
| RemoveHead(LexizeData *ld) {
 | |
| 	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
 | |
| 
 | |
| 	ld->posDict = 0;
 | |
| }
 | |
| 
 | |
| static void
 | |
| setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) {
 | |
| 	if ( correspondLexem ) {
 | |
| 		*correspondLexem = ld->waste.head;
 | |
| 	} else {
 | |
| 		ParsedLex	*tmp, *ptr = ld->waste.head;
 | |
| 
 | |
| 		while(ptr) {
 | |
| 			tmp = ptr->next;
 | |
| 			pfree(ptr);
 | |
| 			ptr = tmp;
 | |
| 		}
 | |
| 	}
 | |
| 	ld->waste.head = ld->waste.tail = NULL;
 | |
| }
 | |
| 
 | |
| static void
 | |
| moveToWaste(LexizeData *ld, ParsedLex *stop) {
 | |
| 	bool	go = true;
 | |
| 
 | |
| 	while( ld->towork.head && go) {
 | |
| 		if (ld->towork.head == stop) {
 | |
| 			ld->curSub = stop->next;
 | |
| 			go = false;
 | |
| 		}
 | |
| 		RemoveHead(ld);
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static void
 | |
| setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) {
 | |
| 	if ( ld->tmpRes ) {
 | |
| 		TSLexeme	*ptr;
 | |
| 		for( ptr=ld->tmpRes; ptr->lexeme; ptr++ ) 
 | |
| 			pfree( ptr->lexeme );
 | |
| 		pfree( ld->tmpRes );
 | |
| 	}
 | |
| 	ld->tmpRes = res;
 | |
| 	ld->lastRes = lex;
 | |
| }
 | |
| 
 | |
| TSLexeme*
 | |
| LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) {
 | |
| 	int i;
 | |
| 	ListDictionary	*map;
 | |
| 	DictInfo *dict;
 | |
| 	TSLexeme	*res;
 | |
| 
 | |
| 	if ( ld->curDictId == InvalidOid ) {
 | |
| 		/* 
 | |
| 		 * usial mode: dictionary wants only one word,
 | |
| 		 * but we should keep in mind that we should go through
 | |
| 		 * all stack
 | |
| 		 */
 | |
| 
 | |
| 		while( ld->towork.head ) {
 | |
| 			ParsedLex	*curVal = ld->towork.head;
 | |
| 
 | |
| 			map = ld->cfg->map + curVal->type;
 | |
| 
 | |
| 			if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0 ) {	
 | |
| 				/* skip this type of lexeme */
 | |
| 				RemoveHead(ld);
 | |
| 				continue;
 | |
| 			}
 | |
| 
 | |
| 			for (i = ld->posDict; i < map->len; i++) {
 | |
| 				dict = finddict(DatumGetObjectId(map->dict_id[i]));
 | |
| 
 | |
| 				ld->dictState.isend = ld->dictState.getnext = false;
 | |
| 				ld->dictState.private = NULL;
 | |
| 				res = (TSLexeme *) DatumGetPointer( FunctionCall4(
 | |
| 													&(dict->lexize_info),
 | |
| 									   				PointerGetDatum(dict->dictionary),
 | |
| 												   	PointerGetDatum(curVal->lemm),
 | |
| 												 	Int32GetDatum(curVal->lenlemm),
 | |
| 													PointerGetDatum(&ld->dictState)
 | |
| 										 ));
 | |
| 
 | |
| 				if ( ld->dictState.getnext ) {
 | |
| 					/* 
 | |
| 					 * dictinary wants next word, so setup and store
 | |
| 					 * current position and go to multiword  mode
 | |
| 					 */
 | |
| 					 
 | |
| 					ld->curDictId = DatumGetObjectId(map->dict_id[i]);
 | |
| 					ld->posDict = i+1;
 | |
| 					ld->curSub = curVal->next;
 | |
| 					if ( res )
 | |
| 						setNewTmpRes(ld, curVal, res);
 | |
| 					return LexizeExec(ld, correspondLexem);
 | |
| 				}
 | |
| 
 | |
| 				if (!res)			/* dictionary doesn't know this lexeme */
 | |
| 					continue;
 | |
| 				
 | |
| 				RemoveHead(ld);
 | |
| 				setCorrLex(ld, correspondLexem);
 | |
| 				return res;
 | |
| 			}
 | |
| 
 | |
| 			RemoveHead(ld);
 | |
| 		} 
 | |
| 	} else { /* curDictId is valid */
 | |
| 		dict = finddict(ld->curDictId);
 | |
| 		
 | |
| 		/*
 | |
| 		 * Dictionary ld->curDictId asks  us about following words
 | |
| 		 */
 | |
| 
 | |
| 		while( ld->curSub ) {
 | |
| 			ParsedLex	*curVal = ld->curSub;
 | |
| 
 | |
| 			map = ld->cfg->map + curVal->type;
 | |
| 
 | |
| 			if (curVal->type != 0) {
 | |
| 				bool dictExists = false;
 | |
| 
 | |
| 				if (curVal->type >= ld->cfg->len || map->len == 0 ) {	
 | |
| 					/* skip this type of lexeme */
 | |
| 					ld->curSub = curVal->next;
 | |
| 					continue;
 | |
| 				}
 | |
| 
 | |
| 				/*
 | |
| 				 * We should be sure that current type of lexeme is recognized by
 | |
| 				 * our dictinonary: we just check is it exist in 
 | |
| 				 * list of dictionaries ?
 | |
| 				 */
 | |
| 				for(i=0;i < map->len && !dictExists; i++) 
 | |
| 					if ( ld->curDictId == DatumGetObjectId(map->dict_id[i]) )
 | |
| 						dictExists = true;
 | |
| 
 | |
| 				if ( !dictExists ) {
 | |
| 					/*
 | |
| 					 * Dictionary can't work with current tpe of lexeme,
 | |
| 					 * return to basic mode and redo all stored lexemes
 | |
| 					 */
 | |
| 					ld->curDictId = InvalidOid;
 | |
| 					return LexizeExec(ld, correspondLexem);
 | |
| 				}
 | |
| 			} 
 | |
| 	
 | |
| 			ld->dictState.isend = (curVal->type==0) ? true : false;
 | |
| 			ld->dictState.getnext = false;
 | |
| 
 | |
| 			res = (TSLexeme *) DatumGetPointer( FunctionCall4(
 | |
| 												&(dict->lexize_info),
 | |
| 								   				PointerGetDatum(dict->dictionary),
 | |
| 											   	PointerGetDatum(curVal->lemm),
 | |
| 											 	Int32GetDatum(curVal->lenlemm),
 | |
| 												PointerGetDatum(&ld->dictState)
 | |
| 										 ));
 | |
| 
 | |
| 			if ( ld->dictState.getnext ) {
 | |
| 				/* Dictionary wants one more */
 | |
| 				ld->curSub = curVal->next;
 | |
| 				if ( res )
 | |
| 					setNewTmpRes(ld, curVal, res);
 | |
| 				continue;
 | |
| 			}
 | |
| 
 | |
| 			if ( res || ld->tmpRes ) {
 | |
| 				/*
 | |
| 				 * Dictionary normalizes lexemes,
 | |
| 				 * so we remove from stack all used lexemes ,
 | |
| 				 * return to basic mode and redo end of stack (if it exists)
 | |
| 				 */
 | |
| 				if ( res ) {
 | |
| 					moveToWaste( ld, ld->curSub );
 | |
| 				} else {
 | |
| 					res = ld->tmpRes;
 | |
| 					moveToWaste( ld, ld->lastRes );
 | |
| 				}
 | |
| 
 | |
| 				/* reset to initial state */
 | |
| 				ld->curDictId = InvalidOid;
 | |
| 				ld->posDict = 0;
 | |
| 				ld->lastRes = NULL;
 | |
| 				ld->tmpRes = NULL;
 | |
| 				setCorrLex(ld, correspondLexem);
 | |
| 				return res;
 | |
| 			}
 | |
| 
 | |
| 			/* Dict don't want next lexem and didn't recognize anything,
 | |
| 			   redo from ld->towork.head */
 | |
| 			ld->curDictId = InvalidOid;
 | |
| 			return LexizeExec(ld, correspondLexem);
 | |
| 		}	
 | |
| 	}
 | |
| 
 | |
| 	setCorrLex(ld, correspondLexem);
 | |
| 	return NULL;
 | |
| }
 | |
| 
 |