mirror of
https://github.com/postgres/postgres.git
synced 2025-04-22 23:02:54 +03:00
Allow words in a substitution to be left unlexized (used as is).
Docs will be submitted somewhat later; for now they are at http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
This commit is contained in:
parent
63e464a5e6
commit
92bcb5abe0
@ -1,4 +1,4 @@
|
|||||||
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.4 2006/06/02 18:03:06 teodor Exp $ */
|
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.5 2006/06/06 16:25:55 teodor Exp $ */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* thesaurus
|
* thesaurus
|
||||||
@ -13,6 +13,11 @@
|
|||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "ts_locale.h"
|
#include "ts_locale.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Temporarily we use TSLexeme.flags for internal use...
|
||||||
|
*/
|
||||||
|
#define DT_USEASIS 0x1000
|
||||||
|
|
||||||
typedef struct LexemeInfo {
|
typedef struct LexemeInfo {
|
||||||
uint16 idsubst; /* entry's number in DictThesaurus->subst */
|
uint16 idsubst; /* entry's number in DictThesaurus->subst */
|
||||||
uint16 posinsubst; /* pos info in entry */
|
uint16 posinsubst; /* pos info in entry */
|
||||||
@ -94,7 +99,7 @@ newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst ) {
|
addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis ) {
|
||||||
static int nres=0;
|
static int nres=0;
|
||||||
static int ntres = 0;
|
static int ntres = 0;
|
||||||
TheSubstitute *ptr;
|
TheSubstitute *ptr;
|
||||||
@ -138,7 +143,10 @@ addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16
|
|||||||
ptr->res[ nres ].lexeme[e-b] = '\0';
|
ptr->res[ nres ].lexeme[e-b] = '\0';
|
||||||
|
|
||||||
ptr->res[ nres ].nvariant = nwrd;
|
ptr->res[ nres ].nvariant = nwrd;
|
||||||
ptr->res[ nres ].flags = TSL_ADDPOS;
|
if ( useasis )
|
||||||
|
ptr->res[ nres ].flags = DT_USEASIS;
|
||||||
|
else
|
||||||
|
ptr->res[ nres ].flags = 0;
|
||||||
|
|
||||||
ptr->res[ ++nres ].lexeme = NULL;
|
ptr->res[ ++nres ].lexeme = NULL;
|
||||||
}
|
}
|
||||||
@ -154,6 +162,7 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
|
|||||||
char str[BUFSIZ];
|
char str[BUFSIZ];
|
||||||
int lineno=0;
|
int lineno=0;
|
||||||
uint16 idsubst = 0;
|
uint16 idsubst = 0;
|
||||||
|
bool useasis=false;
|
||||||
|
|
||||||
fh = fopen(to_absfilename(filename), "r");
|
fh = fopen(to_absfilename(filename), "r");
|
||||||
if (!fh)
|
if (!fh)
|
||||||
@ -196,13 +205,24 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
|
|||||||
state = TR_WAITLEX;
|
state = TR_WAITLEX;
|
||||||
}
|
}
|
||||||
} else if ( state == TR_WAITSUBS ) {
|
} else if ( state == TR_WAITSUBS ) {
|
||||||
if ( !t_isspace(ptr) ) {
|
if ( t_iseq(ptr, '*') ) {
|
||||||
|
useasis = true;
|
||||||
|
state = TR_INSUBS;
|
||||||
|
beginwrd = ptr + pg_mblen(ptr);
|
||||||
|
} else if ( t_iseq(ptr, '\\') ) {
|
||||||
|
useasis = false;
|
||||||
|
state = TR_INSUBS;
|
||||||
|
beginwrd = ptr + pg_mblen(ptr);
|
||||||
|
} else if ( !t_isspace(ptr) ) {
|
||||||
|
useasis = false;
|
||||||
beginwrd = ptr;
|
beginwrd = ptr;
|
||||||
state = TR_INSUBS;
|
state = TR_INSUBS;
|
||||||
}
|
}
|
||||||
} else if ( state == TR_INSUBS ) {
|
} else if ( state == TR_INSUBS ) {
|
||||||
if ( t_isspace(ptr) ) {
|
if ( t_isspace(ptr) ) {
|
||||||
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
|
if ( ptr == beginwrd )
|
||||||
|
elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno);
|
||||||
|
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis );
|
||||||
state = TR_WAITSUBS;
|
state = TR_WAITSUBS;
|
||||||
}
|
}
|
||||||
} else
|
} else
|
||||||
@ -211,8 +231,11 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
|
|||||||
ptr += pg_mblen(ptr);
|
ptr += pg_mblen(ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( state == TR_INSUBS )
|
if ( state == TR_INSUBS ) {
|
||||||
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
|
if ( ptr == beginwrd )
|
||||||
|
elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno);
|
||||||
|
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis );
|
||||||
|
}
|
||||||
|
|
||||||
idsubst++;
|
idsubst++;
|
||||||
|
|
||||||
@ -319,7 +342,9 @@ compileTheLexeme(DictThesaurus *d) {
|
|||||||
elog(ERROR,"Out of memory");
|
elog(ERROR,"Out of memory");
|
||||||
|
|
||||||
for(i=0;i<d->nwrds;i++) {
|
for(i=0;i<d->nwrds;i++) {
|
||||||
TSLexeme *ptr = (TSLexeme*) DatumGetPointer(
|
TSLexeme *ptr;
|
||||||
|
|
||||||
|
ptr = (TSLexeme*) DatumGetPointer(
|
||||||
FunctionCall4(
|
FunctionCall4(
|
||||||
&(d->subdict.lexize_info),
|
&(d->subdict.lexize_info),
|
||||||
PointerGetDatum(d->subdict.dictionary),
|
PointerGetDatum(d->subdict.dictionary),
|
||||||
@ -331,9 +356,11 @@ compileTheLexeme(DictThesaurus *d) {
|
|||||||
|
|
||||||
if ( !(ptr && ptr->lexeme) ) {
|
if ( !(ptr && ptr->lexeme) ) {
|
||||||
if ( !ptr )
|
if ( !ptr )
|
||||||
elog(ERROR,"Thesaurus: word '%s' isn't recognized by subdictionary", d->wrds[i].lexeme);
|
elog(ERROR,"Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)",
|
||||||
|
d->wrds[i].lexeme, d->wrds[i].entries->idsubst+1 );
|
||||||
else
|
else
|
||||||
elog(NOTICE,"Thesaurus: word '%s' is recognized as stop-word, assign any stop-word", d->wrds[i].lexeme);
|
elog(NOTICE,"Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)",
|
||||||
|
d->wrds[i].lexeme, d->wrds[i].entries->idsubst+1);
|
||||||
|
|
||||||
newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
|
newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
|
||||||
} else {
|
} else {
|
||||||
@ -413,7 +440,15 @@ compileTheSubstitute(DictThesaurus *d) {
|
|||||||
inptr = rem;
|
inptr = rem;
|
||||||
|
|
||||||
while( inptr && inptr->lexeme ) {
|
while( inptr && inptr->lexeme ) {
|
||||||
TSLexeme *reml, *lexized = (TSLexeme*) DatumGetPointer(
|
TSLexeme *lexized, tmplex[2];
|
||||||
|
|
||||||
|
if ( inptr->flags & DT_USEASIS ) { /* do not lexize */
|
||||||
|
tmplex[0] = *inptr;
|
||||||
|
tmplex[0].flags = 0;
|
||||||
|
tmplex[1].lexeme = NULL;
|
||||||
|
lexized = tmplex;
|
||||||
|
} else {
|
||||||
|
lexized = (TSLexeme*) DatumGetPointer(
|
||||||
FunctionCall4(
|
FunctionCall4(
|
||||||
&(d->subdict.lexize_info),
|
&(d->subdict.lexize_info),
|
||||||
PointerGetDatum(d->subdict.dictionary),
|
PointerGetDatum(d->subdict.dictionary),
|
||||||
@ -422,8 +457,8 @@ compileTheSubstitute(DictThesaurus *d) {
|
|||||||
PointerGetDatum(NULL)
|
PointerGetDatum(NULL)
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
}
|
||||||
|
|
||||||
reml = lexized;
|
|
||||||
if ( lexized && lexized->lexeme ) {
|
if ( lexized && lexized->lexeme ) {
|
||||||
int toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res) : -1;
|
int toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res) : -1;
|
||||||
|
|
||||||
@ -447,8 +482,10 @@ compileTheSubstitute(DictThesaurus *d) {
|
|||||||
|
|
||||||
if ( toset > 0)
|
if ( toset > 0)
|
||||||
d->subst[i].res[toset].flags |= TSL_ADDPOS;
|
d->subst[i].res[toset].flags |= TSL_ADDPOS;
|
||||||
|
} else if ( lexized ) {
|
||||||
|
elog(NOTICE,"Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)", inptr->lexeme, i+1);
|
||||||
} else {
|
} else {
|
||||||
elog(NOTICE,"Thesaurus: word '%s' isn't recognized by subdictionary or it's a stop-word, ignored", inptr->lexeme);
|
elog(ERROR,"Thesaurus: word '%s' in substition isn't recognized (rule %d)", inptr->lexeme, i+1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( inptr->lexeme )
|
if ( inptr->lexeme )
|
||||||
@ -457,7 +494,7 @@ compileTheSubstitute(DictThesaurus *d) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if ( outptr == d->subst[i].res )
|
if ( outptr == d->subst[i].res )
|
||||||
elog(ERROR,"Thesaurus: all words in subsitution aren't recognized by subdictionary");
|
elog(ERROR,"Thesaurus: all words in subsitution are stop word (rule %d)", i+1);
|
||||||
|
|
||||||
d->subst[i].reslen = outptr - d->subst[i].res;
|
d->subst[i].reslen = outptr - d->subst[i].res;
|
||||||
|
|
||||||
|
@ -1,14 +1,16 @@
|
|||||||
#
|
#
|
||||||
# Thesaurus config file. Character ':' splits
|
# Thesaurus config file. Character ':' splits
|
||||||
# string to part:
|
# each line into two parts, for example:
|
||||||
# to be substituted string
|
# sample-words : substitute-words
|
||||||
# substituting string
|
|
||||||
#
|
#
|
||||||
|
# Any substitute-word can be marked by preceding '*' character,
|
||||||
|
# which means that this word must not be lexized (it is used as is)
|
||||||
|
# Docs: http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
|
||||||
|
|
||||||
#one two three : 123
|
#one two three : *123
|
||||||
#one two : 12
|
#one two : *12
|
||||||
#one : 1
|
#one : *1
|
||||||
#two : 2
|
#two : *2
|
||||||
|
|
||||||
#foo bar : blah blah
|
#foo bar : blah blah
|
||||||
#f bar : fbar
|
#f bar : fbar
|
||||||
|
Loading…
x
Reference in New Issue
Block a user