mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-19 15:49:24 +03:00 
			
		
		
		
	Allow do not lexize words in substitution.
Docs will be submitted some later, now it's at http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
This commit is contained in:
		| @@ -1,4 +1,4 @@ | ||||
| /* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.4 2006/06/02 18:03:06 teodor Exp $ */ | ||||
| /* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.5 2006/06/06 16:25:55 teodor Exp $ */ | ||||
|  | ||||
| /* | ||||
|  * thesaurus | ||||
| @@ -13,6 +13,11 @@ | ||||
| #include "common.h" | ||||
| #include "ts_locale.h" | ||||
|  | ||||
| /* | ||||
|  * Temporay we use TSLexeme.flags for inner use... | ||||
|  */ | ||||
| #define	DT_USEASIS		0x1000 | ||||
|  | ||||
| typedef struct LexemeInfo { | ||||
| 	uint16	idsubst; /* entry's number in DictThesaurus->subst */ | ||||
| 	uint16	posinsubst; /* pos info in entry */ | ||||
| @@ -94,7 +99,7 @@ newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst | ||||
| } | ||||
|  | ||||
| static void | ||||
| addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst ) { | ||||
| addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis ) { | ||||
| 	static	int nres=0; | ||||
| 	static  int ntres = 0; | ||||
| 	TheSubstitute	*ptr; | ||||
| @@ -138,7 +143,10 @@ addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 | ||||
| 	ptr->res[ nres ].lexeme[e-b] = '\0'; | ||||
|  | ||||
| 	ptr->res[ nres ].nvariant = nwrd; | ||||
| 	ptr->res[ nres ].flags = TSL_ADDPOS; | ||||
| 	if ( useasis ) | ||||
| 		ptr->res[ nres ].flags = DT_USEASIS; | ||||
| 	else | ||||
| 		ptr->res[ nres ].flags = 0; | ||||
|  | ||||
| 	ptr->res[ ++nres ].lexeme = NULL; | ||||
| } | ||||
| @@ -154,6 +162,7 @@ thesaurusRead( char *filename, DictThesaurus *d ) { | ||||
| 	char str[BUFSIZ]; | ||||
| 	int lineno=0; | ||||
| 	uint16	idsubst = 0; | ||||
| 	bool	useasis=false; | ||||
|  | ||||
| 	fh = fopen(to_absfilename(filename), "r"); | ||||
| 	if (!fh) | ||||
| @@ -196,13 +205,24 @@ thesaurusRead( char *filename, DictThesaurus *d ) { | ||||
| 					state = TR_WAITLEX; | ||||
| 				} | ||||
| 			} else if ( state == TR_WAITSUBS ) { | ||||
| 				if ( !t_isspace(ptr) ) {  | ||||
| 				if ( t_iseq(ptr, '*') ) { | ||||
| 					useasis = true; | ||||
| 					state = TR_INSUBS; | ||||
| 					beginwrd = ptr + pg_mblen(ptr); | ||||
| 				} else if ( t_iseq(ptr, '\\') ) { | ||||
| 					useasis = false; | ||||
| 					state = TR_INSUBS; | ||||
| 					beginwrd = ptr + pg_mblen(ptr); | ||||
| 				} else if ( !t_isspace(ptr) ) {  | ||||
| 					useasis = false; | ||||
| 					beginwrd = ptr; | ||||
| 					state = TR_INSUBS; | ||||
| 				} | ||||
| 			} else if ( state == TR_INSUBS ) { | ||||
| 				if ( t_isspace(ptr) ) {  | ||||
| 					addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst ); | ||||
| 					if ( ptr == beginwrd ) | ||||
| 						elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno); | ||||
| 					addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis ); | ||||
| 					state = TR_WAITSUBS; | ||||
| 				} | ||||
| 			} else | ||||
| @@ -211,8 +231,11 @@ thesaurusRead( char *filename, DictThesaurus *d ) { | ||||
| 			ptr += pg_mblen(ptr); | ||||
| 		} | ||||
|  | ||||
| 		if ( state == TR_INSUBS ) | ||||
| 			addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst ); | ||||
| 		if ( state == TR_INSUBS ) { | ||||
| 			if ( ptr == beginwrd ) | ||||
| 				elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno); | ||||
| 			addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis ); | ||||
| 		} | ||||
|  | ||||
| 		idsubst++; | ||||
|  | ||||
| @@ -319,7 +342,9 @@ compileTheLexeme(DictThesaurus *d) { | ||||
| 		elog(ERROR,"Out of memory"); | ||||
|  | ||||
| 	for(i=0;i<d->nwrds;i++) { | ||||
| 		TSLexeme *ptr = (TSLexeme*) DatumGetPointer(  | ||||
| 		TSLexeme *ptr; | ||||
|  | ||||
| 		ptr = (TSLexeme*) DatumGetPointer(  | ||||
| 				FunctionCall4( | ||||
| 					&(d->subdict.lexize_info), | ||||
| 					PointerGetDatum(d->subdict.dictionary), | ||||
| @@ -331,9 +356,11 @@ compileTheLexeme(DictThesaurus *d) { | ||||
|  | ||||
| 		if ( !(ptr && ptr->lexeme) ) { | ||||
| 			if ( !ptr ) | ||||
| 				elog(ERROR,"Thesaurus: word '%s' isn't recognized by subdictionary", d->wrds[i].lexeme); | ||||
| 				elog(ERROR,"Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)",  | ||||
| 						d->wrds[i].lexeme, d->wrds[i].entries->idsubst+1	); | ||||
| 			else | ||||
| 				elog(NOTICE,"Thesaurus: word '%s' is recognized as stop-word, assign any stop-word", d->wrds[i].lexeme); | ||||
| 				elog(NOTICE,"Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)",  | ||||
| 					d->wrds[i].lexeme,  d->wrds[i].entries->idsubst+1); | ||||
|  | ||||
| 			newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0); | ||||
| 		} else { | ||||
| @@ -413,17 +440,25 @@ compileTheSubstitute(DictThesaurus *d) { | ||||
| 		inptr = rem; | ||||
|  | ||||
| 		while( inptr && inptr->lexeme ) {  | ||||
| 			TSLexeme	*reml, *lexized = (TSLexeme*) DatumGetPointer(  | ||||
| 				FunctionCall4( | ||||
| 					&(d->subdict.lexize_info), | ||||
| 					PointerGetDatum(d->subdict.dictionary), | ||||
| 					PointerGetDatum(inptr->lexeme), | ||||
| 					Int32GetDatum(strlen(inptr->lexeme)), | ||||
| 					PointerGetDatum(NULL) | ||||
| 				) | ||||
| 			); | ||||
| 			TSLexeme	*lexized, tmplex[2]; | ||||
|  | ||||
| 			if ( inptr->flags & DT_USEASIS ) { /* do not lexize */ | ||||
| 				tmplex[0] = *inptr; | ||||
| 				tmplex[0].flags = 0;  | ||||
| 				tmplex[1].lexeme = NULL; | ||||
| 				lexized = tmplex;  | ||||
| 			} else { | ||||
| 				lexized = (TSLexeme*) DatumGetPointer(  | ||||
| 					FunctionCall4( | ||||
| 						&(d->subdict.lexize_info), | ||||
| 						PointerGetDatum(d->subdict.dictionary), | ||||
| 						PointerGetDatum(inptr->lexeme), | ||||
| 						Int32GetDatum(strlen(inptr->lexeme)), | ||||
| 						PointerGetDatum(NULL) | ||||
| 					) | ||||
| 				); | ||||
| 			} | ||||
|  | ||||
| 			reml = lexized; | ||||
| 			if ( lexized && lexized->lexeme ) { | ||||
| 				int toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res)  : -1; | ||||
|  | ||||
| @@ -447,8 +482,10 @@ compileTheSubstitute(DictThesaurus *d) { | ||||
|  | ||||
| 				if ( toset > 0) | ||||
| 					d->subst[i].res[toset].flags |= TSL_ADDPOS; | ||||
| 			} else if ( lexized ) { | ||||
| 				elog(NOTICE,"Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)", inptr->lexeme, i+1); | ||||
| 			} else { | ||||
| 				elog(NOTICE,"Thesaurus: word '%s' isn't recognized by subdictionary or it's a stop-word, ignored", inptr->lexeme); | ||||
| 				elog(ERROR,"Thesaurus: word '%s' in substition isn't recognized (rule %d)", inptr->lexeme, i+1); | ||||
| 			} | ||||
|  | ||||
| 			if ( inptr->lexeme ) | ||||
| @@ -457,7 +494,7 @@ compileTheSubstitute(DictThesaurus *d) { | ||||
| 		} | ||||
|  | ||||
| 		if ( outptr == d->subst[i].res ) | ||||
| 			elog(ERROR,"Thesaurus: all words in subsitution aren't recognized by subdictionary"); | ||||
| 			elog(ERROR,"Thesaurus: all words in subsitution are stop word (rule %d)", i+1); | ||||
|  | ||||
| 		d->subst[i].reslen = outptr - d->subst[i].res; | ||||
|  | ||||
| @@ -717,7 +754,7 @@ thesaurus_lexize(PG_FUNCTION_ARGS) | ||||
|  | ||||
| 			infos = (LexemeInfo**)palloc(sizeof(LexemeInfo*)*nlex); | ||||
| 			for(i=0;i<nlex;i++)  | ||||
| 				if ( (infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL ) | ||||
| 				if ( (infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL )  | ||||
| 					break; | ||||
|  | ||||
| 			if ( i<nlex ) {  | ||||
|   | ||||
| @@ -1,14 +1,16 @@ | ||||
| # | ||||
| # Theasurus config file. Character ':' splits | ||||
| # string to part:  | ||||
| #     to be substituted string | ||||
| #     substituting string | ||||
| # string to part, example:  | ||||
| # sample-words : substitute-words | ||||
| # | ||||
| # Any substitute-word can be marked by preceding '*' character, | ||||
| # which means do not lexize this word | ||||
| # Docs: http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary | ||||
|  | ||||
| #one two three : 123 | ||||
| #one two : 12 | ||||
| #one : 1 | ||||
| #two : 2 | ||||
| #one two three : *123 | ||||
| #one two : *12 | ||||
| #one : *1 | ||||
| #two : *2 | ||||
|  | ||||
| #foo bar : blah blah | ||||
| #f   bar : fbar | ||||
|   | ||||
		Reference in New Issue
	
	Block a user