mirror of
https://github.com/postgres/postgres.git
synced 2025-09-11 00:12:06 +03:00
We used to strategically place newlines after some function call left parentheses to make pgindent move the argument list a few chars to the left, so that the whole line would fit under 80 chars. However, pgindent no longer does that, so the newlines just made the code vertically longer for no reason. Remove those newlines, and reflow some of those lines for some extra naturality. Reviewed-by: Michael Paquier, Tom Lane Discussion: https://postgr.es/m/20200129200401.GA6303@alvherre.pgsql
883 lines
18 KiB
C
883 lines
18 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* dict_thesaurus.c
|
|
* Thesaurus dictionary: phrase to phrase substitution
|
|
*
|
|
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
|
|
*
|
|
*
|
|
* IDENTIFICATION
|
|
* src/backend/tsearch/dict_thesaurus.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include "catalog/namespace.h"
|
|
#include "commands/defrem.h"
|
|
#include "tsearch/ts_cache.h"
|
|
#include "tsearch/ts_locale.h"
|
|
#include "tsearch/ts_utils.h"
|
|
#include "utils/builtins.h"
|
|
#include "utils/regproc.h"
|
|
|
|
|
|
/*
|
|
* Temporarily, we use TSLexeme.flags for internal purposes...
|
|
*/
|
|
#define DT_USEASIS 0x1000 /* substitute lexeme is taken as-is, i.e. not passed through the subdictionary */
|
|
|
|
typedef struct LexemeInfo
|
|
{
|
|
uint32 idsubst; /* entry's number in DictThesaurus->subst */
|
|
uint16 posinsubst; /* pos info in entry */
|
|
uint16 tnvariant; /* total num lexemes in one variant */
|
|
struct LexemeInfo *nextentry;
|
|
struct LexemeInfo *nextvariant;
|
|
} LexemeInfo;
|
|
|
|
typedef struct
|
|
{
|
|
char *lexeme;
|
|
LexemeInfo *entries;
|
|
} TheLexeme;
|
|
|
|
typedef struct
|
|
{
|
|
uint16 lastlexeme; /* number lexemes to substitute */
|
|
uint16 reslen;
|
|
TSLexeme *res; /* prepared substituted result */
|
|
} TheSubstitute;
|
|
|
|
typedef struct
|
|
{
|
|
/* subdictionary to normalize lexemes */
|
|
Oid subdictOid;
|
|
TSDictionaryCacheEntry *subdict;
|
|
|
|
/* Array to search lexeme by exact match */
|
|
TheLexeme *wrds;
|
|
int nwrds; /* current number of words */
|
|
int ntwrds; /* allocated array length */
|
|
|
|
/*
|
|
* Storage of substituted result, n-th element is for n-th expression
|
|
*/
|
|
TheSubstitute *subst;
|
|
int nsubst;
|
|
} DictThesaurus;
|
|
|
|
|
|
static void
|
|
newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst)
|
|
{
|
|
TheLexeme *ptr;
|
|
|
|
if (d->nwrds >= d->ntwrds)
|
|
{
|
|
if (d->ntwrds == 0)
|
|
{
|
|
d->ntwrds = 16;
|
|
d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
|
|
}
|
|
else
|
|
{
|
|
d->ntwrds *= 2;
|
|
d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
|
|
}
|
|
}
|
|
|
|
ptr = d->wrds + d->nwrds;
|
|
d->nwrds++;
|
|
|
|
ptr->lexeme = palloc(e - b + 1);
|
|
|
|
memcpy(ptr->lexeme, b, e - b);
|
|
ptr->lexeme[e - b] = '\0';
|
|
|
|
ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
|
|
|
|
ptr->entries->nextentry = NULL;
|
|
ptr->entries->idsubst = idsubst;
|
|
ptr->entries->posinsubst = posinsubst;
|
|
}
|
|
|
|
static void
|
|
addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
|
|
{
|
|
static int nres = 0;
|
|
static int ntres = 0;
|
|
TheSubstitute *ptr;
|
|
|
|
if (nwrd == 0)
|
|
{
|
|
nres = ntres = 0;
|
|
|
|
if (idsubst >= d->nsubst)
|
|
{
|
|
if (d->nsubst == 0)
|
|
{
|
|
d->nsubst = 16;
|
|
d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
|
|
}
|
|
else
|
|
{
|
|
d->nsubst *= 2;
|
|
d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
|
|
}
|
|
}
|
|
}
|
|
|
|
ptr = d->subst + idsubst;
|
|
|
|
ptr->lastlexeme = posinsubst - 1;
|
|
|
|
if (nres + 1 >= ntres)
|
|
{
|
|
if (ntres == 0)
|
|
{
|
|
ntres = 2;
|
|
ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
|
|
}
|
|
else
|
|
{
|
|
ntres *= 2;
|
|
ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
|
|
}
|
|
}
|
|
|
|
ptr->res[nres].lexeme = palloc(e - b + 1);
|
|
memcpy(ptr->res[nres].lexeme, b, e - b);
|
|
ptr->res[nres].lexeme[e - b] = '\0';
|
|
|
|
ptr->res[nres].nvariant = nwrd;
|
|
if (useasis)
|
|
ptr->res[nres].flags = DT_USEASIS;
|
|
else
|
|
ptr->res[nres].flags = 0;
|
|
|
|
ptr->res[++nres].lexeme = NULL;
|
|
}
|
|
|
|
/* Parser states used by thesaurusRead while scanning one line */
enum
{
    TR_WAITLEX = 1,             /* between sample words, before the ':' */
    TR_INLEX = 2,               /* inside a sample word */
    TR_WAITSUBS = 3,            /* after the ':', between substitute words */
    TR_INSUBS = 4               /* inside a substitute word */
};
|
|
|
|
static void
|
|
thesaurusRead(const char *filename, DictThesaurus *d)
|
|
{
|
|
tsearch_readline_state trst;
|
|
uint32 idsubst = 0;
|
|
bool useasis = false;
|
|
char *line;
|
|
|
|
filename = get_tsearch_config_filename(filename, "ths");
|
|
if (!tsearch_readline_begin(&trst, filename))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
errmsg("could not open thesaurus file \"%s\": %m",
|
|
filename)));
|
|
|
|
while ((line = tsearch_readline(&trst)) != NULL)
|
|
{
|
|
char *ptr;
|
|
int state = TR_WAITLEX;
|
|
char *beginwrd = NULL;
|
|
uint32 posinsubst = 0;
|
|
uint32 nwrd = 0;
|
|
|
|
ptr = line;
|
|
|
|
/* is it a comment? */
|
|
while (*ptr && t_isspace(ptr))
|
|
ptr += pg_mblen(ptr);
|
|
|
|
if (t_iseq(ptr, '#') || *ptr == '\0' ||
|
|
t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
|
|
{
|
|
pfree(line);
|
|
continue;
|
|
}
|
|
|
|
while (*ptr)
|
|
{
|
|
if (state == TR_WAITLEX)
|
|
{
|
|
if (t_iseq(ptr, ':'))
|
|
{
|
|
if (posinsubst == 0)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
errmsg("unexpected delimiter")));
|
|
state = TR_WAITSUBS;
|
|
}
|
|
else if (!t_isspace(ptr))
|
|
{
|
|
beginwrd = ptr;
|
|
state = TR_INLEX;
|
|
}
|
|
}
|
|
else if (state == TR_INLEX)
|
|
{
|
|
if (t_iseq(ptr, ':'))
|
|
{
|
|
newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
|
|
state = TR_WAITSUBS;
|
|
}
|
|
else if (t_isspace(ptr))
|
|
{
|
|
newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
|
|
state = TR_WAITLEX;
|
|
}
|
|
}
|
|
else if (state == TR_WAITSUBS)
|
|
{
|
|
if (t_iseq(ptr, '*'))
|
|
{
|
|
useasis = true;
|
|
state = TR_INSUBS;
|
|
beginwrd = ptr + pg_mblen(ptr);
|
|
}
|
|
else if (t_iseq(ptr, '\\'))
|
|
{
|
|
useasis = false;
|
|
state = TR_INSUBS;
|
|
beginwrd = ptr + pg_mblen(ptr);
|
|
}
|
|
else if (!t_isspace(ptr))
|
|
{
|
|
useasis = false;
|
|
beginwrd = ptr;
|
|
state = TR_INSUBS;
|
|
}
|
|
}
|
|
else if (state == TR_INSUBS)
|
|
{
|
|
if (t_isspace(ptr))
|
|
{
|
|
if (ptr == beginwrd)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
errmsg("unexpected end of line or lexeme")));
|
|
addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
|
|
state = TR_WAITSUBS;
|
|
}
|
|
}
|
|
else
|
|
elog(ERROR, "unrecognized thesaurus state: %d", state);
|
|
|
|
ptr += pg_mblen(ptr);
|
|
}
|
|
|
|
if (state == TR_INSUBS)
|
|
{
|
|
if (ptr == beginwrd)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
errmsg("unexpected end of line or lexeme")));
|
|
addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
|
|
}
|
|
|
|
idsubst++;
|
|
|
|
if (!(nwrd && posinsubst))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
errmsg("unexpected end of line")));
|
|
|
|
/*
|
|
* Note: currently, tsearch_readline can't return lines exceeding 4KB,
|
|
* so overflow of the word counts is impossible. But that may not
|
|
* always be true, so let's check.
|
|
*/
|
|
if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
errmsg("too many lexemes in thesaurus entry")));
|
|
|
|
pfree(line);
|
|
}
|
|
|
|
d->nsubst = idsubst;
|
|
|
|
tsearch_readline_end(&trst);
|
|
}
|
|
|
|
static TheLexeme *
|
|
addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
|
|
{
|
|
if (*nnw >= *tnm)
|
|
{
|
|
*tnm *= 2;
|
|
newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
|
|
}
|
|
|
|
newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
|
|
|
|
if (lexeme && lexeme->lexeme)
|
|
{
|
|
newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
|
|
newwrds[*nnw].entries->tnvariant = tnvariant;
|
|
}
|
|
else
|
|
{
|
|
newwrds[*nnw].lexeme = NULL;
|
|
newwrds[*nnw].entries->tnvariant = 1;
|
|
}
|
|
|
|
newwrds[*nnw].entries->idsubst = src->idsubst;
|
|
newwrds[*nnw].entries->posinsubst = src->posinsubst;
|
|
|
|
newwrds[*nnw].entries->nextentry = NULL;
|
|
|
|
(*nnw)++;
|
|
return newwrds;
|
|
}
|
|
|
|
static int
|
|
cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
|
|
{
|
|
if (a == NULL || b == NULL)
|
|
return 0;
|
|
|
|
if (a->idsubst == b->idsubst)
|
|
{
|
|
if (a->posinsubst == b->posinsubst)
|
|
{
|
|
if (a->tnvariant == b->tnvariant)
|
|
return 0;
|
|
|
|
return (a->tnvariant > b->tnvariant) ? 1 : -1;
|
|
}
|
|
|
|
return (a->posinsubst > b->posinsubst) ? 1 : -1;
|
|
}
|
|
|
|
return (a->idsubst > b->idsubst) ? 1 : -1;
|
|
}
|
|
|
|
static int
|
|
cmpLexeme(const TheLexeme *a, const TheLexeme *b)
|
|
{
|
|
if (a->lexeme == NULL)
|
|
{
|
|
if (b->lexeme == NULL)
|
|
return 0;
|
|
else
|
|
return 1;
|
|
}
|
|
else if (b->lexeme == NULL)
|
|
return -1;
|
|
|
|
return strcmp(a->lexeme, b->lexeme);
|
|
}
|
|
|
|
static int
|
|
cmpLexemeQ(const void *a, const void *b)
|
|
{
|
|
return cmpLexeme((const TheLexeme *) a, (const TheLexeme *) b);
|
|
}
|
|
|
|
static int
|
|
cmpTheLexeme(const void *a, const void *b)
|
|
{
|
|
const TheLexeme *la = (const TheLexeme *) a;
|
|
const TheLexeme *lb = (const TheLexeme *) b;
|
|
int res;
|
|
|
|
if ((res = cmpLexeme(la, lb)) != 0)
|
|
return res;
|
|
|
|
return -cmpLexemeInfo(la->entries, lb->entries);
|
|
}
|
|
|
|
static void
|
|
compileTheLexeme(DictThesaurus *d)
|
|
{
|
|
int i,
|
|
nnw = 0,
|
|
tnm = 16;
|
|
TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
|
|
*ptrwrds;
|
|
|
|
for (i = 0; i < d->nwrds; i++)
|
|
{
|
|
TSLexeme *ptr;
|
|
|
|
if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
|
|
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
|
|
else
|
|
{
|
|
ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
|
|
PointerGetDatum(d->subdict->dictData),
|
|
PointerGetDatum(d->wrds[i].lexeme),
|
|
Int32GetDatum(strlen(d->wrds[i].lexeme)),
|
|
PointerGetDatum(NULL)));
|
|
|
|
if (!ptr)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
|
|
d->wrds[i].lexeme,
|
|
d->wrds[i].entries->idsubst + 1)));
|
|
else if (!(ptr->lexeme))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
|
|
d->wrds[i].lexeme,
|
|
d->wrds[i].entries->idsubst + 1),
|
|
errhint("Use \"?\" to represent a stop word within a sample phrase.")));
|
|
else
|
|
{
|
|
while (ptr->lexeme)
|
|
{
|
|
TSLexeme *remptr = ptr + 1;
|
|
int tnvar = 1;
|
|
int curvar = ptr->nvariant;
|
|
|
|
/* compute n words in one variant */
|
|
while (remptr->lexeme)
|
|
{
|
|
if (remptr->nvariant != (remptr - 1)->nvariant)
|
|
break;
|
|
tnvar++;
|
|
remptr++;
|
|
}
|
|
|
|
remptr = ptr;
|
|
while (remptr->lexeme && remptr->nvariant == curvar)
|
|
{
|
|
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
|
|
remptr++;
|
|
}
|
|
|
|
ptr = remptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
pfree(d->wrds[i].lexeme);
|
|
pfree(d->wrds[i].entries);
|
|
}
|
|
|
|
if (d->wrds)
|
|
pfree(d->wrds);
|
|
d->wrds = newwrds;
|
|
d->nwrds = nnw;
|
|
d->ntwrds = tnm;
|
|
|
|
if (d->nwrds > 1)
|
|
{
|
|
qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
|
|
|
|
/* uniq */
|
|
newwrds = d->wrds;
|
|
ptrwrds = d->wrds + 1;
|
|
while (ptrwrds - d->wrds < d->nwrds)
|
|
{
|
|
if (cmpLexeme(ptrwrds, newwrds) == 0)
|
|
{
|
|
if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
|
|
{
|
|
ptrwrds->entries->nextentry = newwrds->entries;
|
|
newwrds->entries = ptrwrds->entries;
|
|
}
|
|
else
|
|
pfree(ptrwrds->entries);
|
|
|
|
if (ptrwrds->lexeme)
|
|
pfree(ptrwrds->lexeme);
|
|
}
|
|
else
|
|
{
|
|
newwrds++;
|
|
*newwrds = *ptrwrds;
|
|
}
|
|
|
|
ptrwrds++;
|
|
}
|
|
|
|
d->nwrds = newwrds - d->wrds + 1;
|
|
d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
|
|
}
|
|
}
|
|
|
|
static void
|
|
compileTheSubstitute(DictThesaurus *d)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < d->nsubst; i++)
|
|
{
|
|
TSLexeme *rem = d->subst[i].res,
|
|
*outptr,
|
|
*inptr;
|
|
int n = 2;
|
|
|
|
outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
|
|
outptr->lexeme = NULL;
|
|
inptr = rem;
|
|
|
|
while (inptr && inptr->lexeme)
|
|
{
|
|
TSLexeme *lexized,
|
|
tmplex[2];
|
|
|
|
if (inptr->flags & DT_USEASIS)
|
|
{ /* do not lexize */
|
|
tmplex[0] = *inptr;
|
|
tmplex[0].flags = 0;
|
|
tmplex[1].lexeme = NULL;
|
|
lexized = tmplex;
|
|
}
|
|
else
|
|
{
|
|
lexized = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
|
|
PointerGetDatum(d->subdict->dictData),
|
|
PointerGetDatum(inptr->lexeme),
|
|
Int32GetDatum(strlen(inptr->lexeme)),
|
|
PointerGetDatum(NULL)));
|
|
}
|
|
|
|
if (lexized && lexized->lexeme)
|
|
{
|
|
int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
|
|
|
|
while (lexized->lexeme)
|
|
{
|
|
if (outptr - d->subst[i].res + 1 >= n)
|
|
{
|
|
int diff = outptr - d->subst[i].res;
|
|
|
|
n *= 2;
|
|
d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
|
|
outptr = d->subst[i].res + diff;
|
|
}
|
|
|
|
*outptr = *lexized;
|
|
outptr->lexeme = pstrdup(lexized->lexeme);
|
|
|
|
outptr++;
|
|
lexized++;
|
|
}
|
|
|
|
if (toset > 0)
|
|
d->subst[i].res[toset].flags |= TSL_ADDPOS;
|
|
}
|
|
else if (lexized)
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
|
|
inptr->lexeme, i + 1)));
|
|
}
|
|
else
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
|
|
inptr->lexeme, i + 1)));
|
|
}
|
|
|
|
if (inptr->lexeme)
|
|
pfree(inptr->lexeme);
|
|
inptr++;
|
|
}
|
|
|
|
if (outptr == d->subst[i].res)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
errmsg("thesaurus substitute phrase is empty (rule %d)",
|
|
i + 1)));
|
|
|
|
d->subst[i].reslen = outptr - d->subst[i].res;
|
|
|
|
pfree(rem);
|
|
}
|
|
}
|
|
|
|
Datum
|
|
thesaurus_init(PG_FUNCTION_ARGS)
|
|
{
|
|
List *dictoptions = (List *) PG_GETARG_POINTER(0);
|
|
DictThesaurus *d;
|
|
char *subdictname = NULL;
|
|
bool fileloaded = false;
|
|
ListCell *l;
|
|
|
|
d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
|
|
|
|
foreach(l, dictoptions)
|
|
{
|
|
DefElem *defel = (DefElem *) lfirst(l);
|
|
|
|
if (strcmp(defel->defname, "dictfile") == 0)
|
|
{
|
|
if (fileloaded)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("multiple DictFile parameters")));
|
|
thesaurusRead(defGetString(defel), d);
|
|
fileloaded = true;
|
|
}
|
|
else if (strcmp(defel->defname, "dictionary") == 0)
|
|
{
|
|
if (subdictname)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("multiple Dictionary parameters")));
|
|
subdictname = pstrdup(defGetString(defel));
|
|
}
|
|
else
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("unrecognized Thesaurus parameter: \"%s\"",
|
|
defel->defname)));
|
|
}
|
|
}
|
|
|
|
if (!fileloaded)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("missing DictFile parameter")));
|
|
if (!subdictname)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("missing Dictionary parameter")));
|
|
|
|
d->subdictOid = get_ts_dict_oid(stringToQualifiedNameList(subdictname), false);
|
|
d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
|
|
|
|
compileTheLexeme(d);
|
|
compileTheSubstitute(d);
|
|
|
|
PG_RETURN_POINTER(d);
|
|
}
|
|
|
|
static LexemeInfo *
|
|
findTheLexeme(DictThesaurus *d, char *lexeme)
|
|
{
|
|
TheLexeme key,
|
|
*res;
|
|
|
|
if (d->nwrds == 0)
|
|
return NULL;
|
|
|
|
key.lexeme = lexeme;
|
|
key.entries = NULL;
|
|
|
|
res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
|
|
|
|
if (res == NULL)
|
|
return NULL;
|
|
return res->entries;
|
|
}
|
|
|
|
static bool
|
|
matchIdSubst(LexemeInfo *stored, uint32 idsubst)
|
|
{
|
|
bool res = true;
|
|
|
|
if (stored)
|
|
{
|
|
res = false;
|
|
|
|
for (; stored; stored = stored->nextvariant)
|
|
if (stored->idsubst == idsubst)
|
|
{
|
|
res = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
static LexemeInfo *
|
|
findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
|
|
{
|
|
for (;;)
|
|
{
|
|
int i;
|
|
LexemeInfo *ptr = newin[0];
|
|
|
|
for (i = 0; i < newn; i++)
|
|
{
|
|
while (newin[i] && newin[i]->idsubst < ptr->idsubst)
|
|
newin[i] = newin[i]->nextentry;
|
|
|
|
if (newin[i] == NULL)
|
|
return in;
|
|
|
|
if (newin[i]->idsubst > ptr->idsubst)
|
|
{
|
|
ptr = newin[i];
|
|
i = -1;
|
|
continue;
|
|
}
|
|
|
|
while (newin[i]->idsubst == ptr->idsubst)
|
|
{
|
|
if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
|
|
{
|
|
ptr = newin[i];
|
|
break;
|
|
}
|
|
|
|
newin[i] = newin[i]->nextentry;
|
|
if (newin[i] == NULL)
|
|
return in;
|
|
}
|
|
|
|
if (newin[i]->idsubst != ptr->idsubst)
|
|
{
|
|
ptr = newin[i];
|
|
i = -1;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
|
|
{ /* found */
|
|
|
|
ptr->nextvariant = in;
|
|
in = ptr;
|
|
}
|
|
|
|
/* step forward */
|
|
for (i = 0; i < newn; i++)
|
|
newin[i] = newin[i]->nextentry;
|
|
}
|
|
}
|
|
|
|
static TSLexeme *
|
|
copyTSLexeme(TheSubstitute *ts)
|
|
{
|
|
TSLexeme *res;
|
|
uint16 i;
|
|
|
|
res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
|
|
for (i = 0; i < ts->reslen; i++)
|
|
{
|
|
res[i] = ts->res[i];
|
|
res[i].lexeme = pstrdup(ts->res[i].lexeme);
|
|
}
|
|
|
|
res[ts->reslen].lexeme = NULL;
|
|
|
|
return res;
|
|
}
|
|
|
|
static TSLexeme *
|
|
checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
|
|
{
|
|
*moreres = false;
|
|
while (info)
|
|
{
|
|
Assert(info->idsubst < d->nsubst);
|
|
if (info->nextvariant)
|
|
*moreres = true;
|
|
if (d->subst[info->idsubst].lastlexeme == curpos)
|
|
return copyTSLexeme(d->subst + info->idsubst);
|
|
info = info->nextvariant;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
Datum
|
|
thesaurus_lexize(PG_FUNCTION_ARGS)
|
|
{
|
|
DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
|
|
DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
|
|
TSLexeme *res = NULL;
|
|
LexemeInfo *stored,
|
|
*info = NULL;
|
|
uint16 curpos = 0;
|
|
bool moreres = false;
|
|
|
|
if (PG_NARGS() != 4 || dstate == NULL)
|
|
elog(ERROR, "forbidden call of thesaurus or nested call");
|
|
|
|
if (dstate->isend)
|
|
PG_RETURN_POINTER(NULL);
|
|
stored = (LexemeInfo *) dstate->private_state;
|
|
|
|
if (stored)
|
|
curpos = stored->posinsubst + 1;
|
|
|
|
if (!d->subdict->isvalid)
|
|
d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
|
|
|
|
res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
|
|
PointerGetDatum(d->subdict->dictData),
|
|
PG_GETARG_DATUM(1),
|
|
PG_GETARG_DATUM(2),
|
|
PointerGetDatum(NULL)));
|
|
|
|
if (res && res->lexeme)
|
|
{
|
|
TSLexeme *ptr = res,
|
|
*basevar;
|
|
|
|
while (ptr->lexeme)
|
|
{
|
|
uint16 nv = ptr->nvariant;
|
|
uint16 i,
|
|
nlex = 0;
|
|
LexemeInfo **infos;
|
|
|
|
basevar = ptr;
|
|
while (ptr->lexeme && nv == ptr->nvariant)
|
|
{
|
|
nlex++;
|
|
ptr++;
|
|
}
|
|
|
|
infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
|
|
for (i = 0; i < nlex; i++)
|
|
if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
|
|
break;
|
|
|
|
if (i < nlex)
|
|
{
|
|
/* no chance to find */
|
|
pfree(infos);
|
|
continue;
|
|
}
|
|
|
|
info = findVariant(info, stored, curpos, infos, nlex);
|
|
}
|
|
}
|
|
else if (res)
|
|
{ /* stop-word */
|
|
LexemeInfo *infos = findTheLexeme(d, NULL);
|
|
|
|
info = findVariant(NULL, stored, curpos, &infos, 1);
|
|
}
|
|
else
|
|
{
|
|
info = NULL; /* word isn't recognized */
|
|
}
|
|
|
|
dstate->private_state = (void *) info;
|
|
|
|
if (!info)
|
|
{
|
|
dstate->getnext = false;
|
|
PG_RETURN_POINTER(NULL);
|
|
}
|
|
|
|
if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
|
|
{
|
|
dstate->getnext = moreres;
|
|
PG_RETURN_POINTER(res);
|
|
}
|
|
|
|
dstate->getnext = true;
|
|
|
|
PG_RETURN_POINTER(NULL);
|
|
}
|