1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-31 22:04:40 +03:00

improve support of agglutinative languages (query with compound words).

regression=# select to_tsquery( '\'fotballklubber\'');
                   to_tsquery
------------------------------------------------
 'fotball' & 'klubb' | 'fot' & 'ball' & 'klubb'
(1 row)

So, changed interface to dictionaries, lexize method of dictionary shoud return
pointer to aray of TSLexeme structs instead of char**. Last element should
have TSLexeme->lexeme == NULL.

typedef struct {
        /* number of variant of split word , for example
                Word 'fotballklubber' (norwegian) has two varian to split:
                ( fotball, klubb ) and ( fot, ball, klubb ). So, dictionary
                should return:
                nvariant        lexeme
                1               fotball
                1               klubb
                2               fot
                2               ball
                2               klubb

        */
        uint16  nvariant;

        /* currently unused */
        uint16  flags;

        /* C-string */
        char    *lexeme;
} TSLexeme;
This commit is contained in:
Teodor Sigaev
2005-01-25 15:24:38 +00:00
parent d314616d12
commit 324300bc7c
12 changed files with 146 additions and 85 deletions

View File

@ -321,10 +321,10 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
for (i = 0; i < cfg->map[type].len; i++)
{
DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
char **norms,
**ptr;
TSLexeme *norms,
*ptr;
norms = ptr = (char **) DatumGetPointer(
norms = ptr = (TSLexeme *) DatumGetPointer(
FunctionCall3(
&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
@ -337,7 +337,7 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
prs->pos++; /* set pos */
while (*ptr)
while (ptr->lexeme)
{
if (prs->curwords == prs->lenwords)
{
@ -345,8 +345,9 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD));
}
prs->words[prs->curwords].len = strlen(*ptr);
prs->words[prs->curwords].word = *ptr;
prs->words[prs->curwords].len = strlen(ptr->lexeme);
prs->words[prs->curwords].word = ptr->lexeme;
prs->words[prs->curwords].nvariant = ptr->nvariant;
prs->words[prs->curwords].alen = 0;
prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
ptr++;
@ -458,10 +459,10 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
for (i = 0; i < cfg->map[type].len; i++)
{
DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
char **norms,
**ptr;
TSLexeme *norms,
*ptr;
norms = ptr = (char **) DatumGetPointer(
norms = ptr = (TSLexeme *) DatumGetPointer(
FunctionCall3(
&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
@ -472,10 +473,10 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
if (!norms) /* dictionary doesn't know this lexem */
continue;
while (*ptr)
while (ptr->lexeme)
{
hlfinditem(prs, query, *ptr, strlen(*ptr));
pfree(*ptr);
hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
pfree(ptr->lexeme);
ptr++;
}
pfree(norms);