1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-11 00:12:06 +03:00
Files
postgres/src/backend/tsearch/dict_thesaurus.c
Alvaro Herrera c9d2977519 Clean up newlines following left parentheses
We used to strategically place newlines after some function call left
parentheses to make pgindent move the argument list a few chars to the
left, so that the whole line would fit under 80 chars.  However,
pgindent no longer does that, so the newlines just made the code
vertically longer for no reason.  Remove those newlines, and reflow some
of those lines for some extra naturality.

Reviewed-by: Michael Paquier, Tom Lane
Discussion: https://postgr.es/m/20200129200401.GA6303@alvherre.pgsql
2020-01-30 13:42:14 -03:00

883 lines
18 KiB
C

/*-------------------------------------------------------------------------
*
* dict_thesaurus.c
* Thesaurus dictionary: phrase to phrase substitution
*
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/tsearch/dict_thesaurus.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/regproc.h"
/*
* Temporarily we use TSLexeme.flags for internal purposes...
*/
#define DT_USEASIS 0x1000
/*
* One occurrence of a sample-phrase lexeme inside a thesaurus rule.
*
* Occurrences of the same lexeme are linked through nextentry (built by
* compileTheLexeme).  nextvariant links the candidates collected by
* findVariant() while a phrase is being matched.
*/
typedef struct LexemeInfo
{
uint32 idsubst; /* entry's number in DictThesaurus->subst */
uint16 posinsubst; /* position of the lexeme within the sample phrase */
uint16 tnvariant; /* total num lexemes in one variant */
struct LexemeInfo *nextentry; /* next occurrence of the same lexeme */
struct LexemeInfo *nextvariant; /* next candidate in a findVariant() chain */
} LexemeInfo;
/*
* A sample-phrase lexeme together with the list of places where it occurs.
*/
typedef struct
{
char *lexeme; /* lexeme text; NULL denotes a stop word after compilation */
LexemeInfo *entries; /* head of the nextentry-linked occurrence list */
} TheLexeme;
/*
* The substitute (right-hand) side of one thesaurus rule.
*/
typedef struct
{
uint16 lastlexeme; /* position of the last sample lexeme (count - 1) */
uint16 reslen; /* number of lexemes in res, set by compileTheSubstitute */
TSLexeme *res; /* prepared substituted result */
} TheSubstitute;
/*
* Per-dictionary state built by thesaurus_init and used by thesaurus_lexize.
*/
typedef struct
{
/* subdictionary to normalize lexemes */
Oid subdictOid;
TSDictionaryCacheEntry *subdict;
/* Array to search lexeme by exact match */
TheLexeme *wrds;
int nwrds; /* current number of words */
int ntwrds; /* allocated array length */
/*
* Storage of substituted result, n-th element is for n-th expression
*/
TheSubstitute *subst;
/* allocated length while the file is read, number of rules afterwards */
int nsubst;
} DictThesaurus;
/*
 * Append one sample-phrase word to d->wrds.
 *
 * The word is the byte range [b, e); it is copied into a freshly allocated,
 * NUL-terminated string.  idsubst is the number of the rule being read and
 * posinsubst is the word's position inside that rule's sample phrase.
 *
 * The array starts with 16 slots and doubles whenever it is full.
 */
static void
newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst)
{
	TheLexeme  *ptr;

	if (d->nwrds >= d->ntwrds)
	{
		if (d->ntwrds == 0)
		{
			d->ntwrds = 16;
			d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
		}
		else
		{
			d->ntwrds *= 2;
			d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
		}
	}

	ptr = d->wrds + d->nwrds;
	d->nwrds++;

	ptr->lexeme = palloc(e - b + 1);
	memcpy(ptr->lexeme, b, e - b);
	ptr->lexeme[e - b] = '\0';

	ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));

	/*
	 * palloc does not zero the memory, so give every field a defined value
	 * rather than leaving nextvariant and tnvariant indeterminate.
	 */
	ptr->entries->nextentry = NULL;
	ptr->entries->nextvariant = NULL;
	ptr->entries->tnvariant = 0;
	ptr->entries->idsubst = idsubst;
	ptr->entries->posinsubst = posinsubst;
}
/*
 * Append one substitute word to rule number idsubst.
 *
 * The word is the byte range [b, e).  nwrd is the word's ordinal within the
 * substitute phrase; nwrd == 0 marks the start of a new rule, which resets
 * the fill counters and, if necessary, grows d->subst.  posinsubst is the
 * number of sample words read for the rule, and useasis tags the word with
 * DT_USEASIS.
 *
 * The result array of the rule is always kept terminated by an element
 * whose lexeme is NULL.
 */
static void
addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
{
	/* fill level and capacity of the result array of the rule being read */
	static int	used = 0;
	static int	allocated = 0;
	TheSubstitute *rule;
	TSLexeme   *slot;
	size_t		wordlen = e - b;

	if (nwrd == 0)
	{
		/* first word of a new rule */
		used = 0;
		allocated = 0;

		if (idsubst >= d->nsubst)
		{
			if (d->nsubst != 0)
			{
				d->nsubst *= 2;
				d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
			}
			else
			{
				d->nsubst = 16;
				d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
			}
		}
	}

	rule = &d->subst[idsubst];
	rule->lastlexeme = posinsubst - 1;

	/* make sure there is room for this word plus the terminator */
	if (used + 1 >= allocated)
	{
		if (allocated != 0)
		{
			allocated *= 2;
			rule->res = (TSLexeme *) repalloc(rule->res, sizeof(TSLexeme) * allocated);
		}
		else
		{
			allocated = 2;
			rule->res = (TSLexeme *) palloc(sizeof(TSLexeme) * allocated);
		}
	}

	slot = &rule->res[used];
	slot->lexeme = palloc(wordlen + 1);
	memcpy(slot->lexeme, b, wordlen);
	slot->lexeme[wordlen] = '\0';
	slot->nvariant = nwrd;
	slot->flags = useasis ? DT_USEASIS : 0;

	/* terminate the array after the word just stored */
	used++;
	rule->res[used].lexeme = NULL;
}
/* States of the line parser in thesaurusRead() */
#define TR_WAITLEX 1 /* before ':', waiting for the next sample word */
#define TR_INLEX 2 /* inside a sample word */
#define TR_WAITSUBS 3 /* after ':', waiting for the next substitute word */
#define TR_INSUBS 4 /* inside a substitute word */
/*
 * Read a thesaurus file and load its rules into d.
 *
 * Each non-comment line has the form "sample words : substitute words".
 * Sample words are stored with newLexeme() and substitute words with
 * addWrd().  A substitute word prefixed by '*' is flagged to be used as is;
 * a leading '\' is skipped without setting that flag.  Lines that are empty
 * or start with '#' (after leading whitespace) are ignored.
 *
 * On return d->nsubst holds the number of rules read.  Malformed input is
 * reported with ERRCODE_CONFIG_FILE_ERROR.
 */
static void
thesaurusRead(const char *filename, DictThesaurus *d)
{
	tsearch_readline_state trst;
	uint32		idsubst = 0;
	bool		useasis = false;
	char	   *line;

	filename = get_tsearch_config_filename(filename, "ths");
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open thesaurus file \"%s\": %m",
						filename)));

	for (;;)
	{
		char	   *ptr;
		char	   *beginwrd = NULL;
		int			state = TR_WAITLEX;
		uint32		posinsubst = 0;
		uint32		nwrd = 0;

		line = tsearch_readline(&trst);
		if (line == NULL)
			break;

		/* skip leading whitespace, then drop comments and blank lines */
		for (ptr = line; *ptr && t_isspace(ptr); ptr += pg_mblen(ptr))
			;

		if (t_iseq(ptr, '#') || *ptr == '\0' ||
			t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
		{
			pfree(line);
			continue;
		}

		/* run the state machine over the rest of the line, one character at a time */
		for (; *ptr; ptr += pg_mblen(ptr))
		{
			switch (state)
			{
				case TR_WAITLEX:
					if (t_iseq(ptr, ':'))
					{
						/* a rule needs at least one sample word */
						if (posinsubst == 0)
							ereport(ERROR,
									(errcode(ERRCODE_CONFIG_FILE_ERROR),
									 errmsg("unexpected delimiter")));
						state = TR_WAITSUBS;
					}
					else if (!t_isspace(ptr))
					{
						beginwrd = ptr;
						state = TR_INLEX;
					}
					break;

				case TR_INLEX:
					{
						bool		atcolon = t_iseq(ptr, ':');

						/* the sample word ends at ':' or at whitespace */
						if (atcolon || t_isspace(ptr))
						{
							newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
							state = atcolon ? TR_WAITSUBS : TR_WAITLEX;
						}
					}
					break;

				case TR_WAITSUBS:
					if (t_iseq(ptr, '*') || t_iseq(ptr, '\\'))
					{
						/* the prefix itself is not part of the word */
						useasis = t_iseq(ptr, '*');
						beginwrd = ptr + pg_mblen(ptr);
						state = TR_INSUBS;
					}
					else if (!t_isspace(ptr))
					{
						useasis = false;
						beginwrd = ptr;
						state = TR_INSUBS;
					}
					break;

				case TR_INSUBS:
					if (t_isspace(ptr))
					{
						/* a bare prefix with nothing after it is an error */
						if (ptr == beginwrd)
							ereport(ERROR,
									(errcode(ERRCODE_CONFIG_FILE_ERROR),
									 errmsg("unexpected end of line or lexeme")));
						addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
						state = TR_WAITSUBS;
					}
					break;

				default:
					elog(ERROR, "unrecognized thesaurus state: %d", state);
					break;
			}
		}

		/* the line ended in the middle of a substitute word: finish it */
		if (state == TR_INSUBS)
		{
			if (ptr == beginwrd)
				ereport(ERROR,
						(errcode(ERRCODE_CONFIG_FILE_ERROR),
						 errmsg("unexpected end of line or lexeme")));
			addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
		}

		idsubst++;

		/* both sides of the rule must be non-empty */
		if (!(nwrd && posinsubst))
			ereport(ERROR,
					(errcode(ERRCODE_CONFIG_FILE_ERROR),
					 errmsg("unexpected end of line")));

		/*
		 * Note: currently, tsearch_readline can't return lines exceeding
		 * 4KB, so the word counts cannot overflow uint16.  That may not stay
		 * true forever, so check anyway.
		 */
		if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
			ereport(ERROR,
					(errcode(ERRCODE_CONFIG_FILE_ERROR),
					 errmsg("too many lexemes in thesaurus entry")));

		pfree(line);
	}

	d->nsubst = idsubst;

	tsearch_readline_end(&trst);
}
/*
 * Append one compiled lexeme to the array newwrds and return the (possibly
 * reallocated) array.
 *
 * *nnw is the number of elements in use and *tnm the allocated length; both
 * are updated.  When lexeme (or its text) is NULL the new element gets a
 * NULL lexeme and tnvariant 1; otherwise the text is copied and tnvariant
 * is taken from the argument.  idsubst and posinsubst are copied from src.
 */
static TheLexeme *
addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
{
	bool		hasword = (lexeme != NULL && lexeme->lexeme != NULL);
	LexemeInfo *info;
	TheLexeme  *slot;

	/* double the array when it is full */
	if (*nnw >= *tnm)
	{
		*tnm *= 2;
		newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
	}

	info = (LexemeInfo *) palloc(sizeof(LexemeInfo));
	info->tnvariant = hasword ? tnvariant : 1;
	info->idsubst = src->idsubst;
	info->posinsubst = src->posinsubst;
	info->nextentry = NULL;

	slot = &newwrds[*nnw];
	slot->lexeme = hasword ? pstrdup(lexeme->lexeme) : NULL;
	slot->entries = info;

	(*nnw)++;

	return newwrds;
}
/*
 * Order two LexemeInfo items by idsubst, then posinsubst, then tnvariant.
 * Returns -1, 0 or 1; if either pointer is NULL the items compare equal.
 */
static int
cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
{
	if (a == NULL || b == NULL)
		return 0;

	if (a->idsubst != b->idsubst)
		return (a->idsubst > b->idsubst) ? 1 : -1;

	if (a->posinsubst != b->posinsubst)
		return (a->posinsubst > b->posinsubst) ? 1 : -1;

	if (a->tnvariant != b->tnvariant)
		return (a->tnvariant > b->tnvariant) ? 1 : -1;

	return 0;
}
/*
 * Compare two TheLexeme items by lexeme text using strcmp.
 * A NULL lexeme sorts after every non-NULL one; two NULLs are equal.
 */
static int
cmpLexeme(const TheLexeme *a, const TheLexeme *b)
{
	if (a->lexeme != NULL && b->lexeme != NULL)
		return strcmp(a->lexeme, b->lexeme);

	if (a->lexeme == NULL && b->lexeme == NULL)
		return 0;

	/* exactly one side is NULL */
	return (a->lexeme == NULL) ? 1 : -1;
}
/*
 * Comparator with the qsort/bsearch signature: compares by lexeme text
 * only, delegating to cmpLexeme.
 */
static int
cmpLexemeQ(const void *a, const void *b)
{
	const TheLexeme *left = (const TheLexeme *) a;
	const TheLexeme *right = (const TheLexeme *) b;

	return cmpLexeme(left, right);
}
/*
 * Comparator with the qsort signature: orders by lexeme text first and, for
 * equal text, by the entries in the reverse of cmpLexemeInfo order.
 */
static int
cmpTheLexeme(const void *a, const void *b)
{
	const TheLexeme *la = (const TheLexeme *) a;
	const TheLexeme *lb = (const TheLexeme *) b;
	int			bytext = cmpLexeme(la, lb);

	if (bytext != 0)
		return bytext;

	return -cmpLexemeInfo(la->entries, lb->entries);
}
/*
* Replace the raw sample words in d->wrds by their normalized form.
*
* Every word read from the file is passed through the subdictionary's
* lexize function (the marker "?" is turned into an entry with a NULL
* lexeme instead).  The resulting array is then sorted with cmpTheLexeme
* and equal lexemes are merged into one element whose entries are linked
* through nextentry.  The old array, its strings and its entries are freed.
*
* A word the subdictionary does not recognize, or reports as a stop word,
* raises an error with ERRCODE_CONFIG_FILE_ERROR.
*/
static void
compileTheLexeme(DictThesaurus *d)
{
int i,
nnw = 0,
tnm = 16;
TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
*ptrwrds;
for (i = 0; i < d->nwrds; i++)
{
TSLexeme *ptr;
if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
else
{
/* normalize the word with the subdictionary */
ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
PointerGetDatum(d->subdict->dictData),
PointerGetDatum(d->wrds[i].lexeme),
Int32GetDatum(strlen(d->wrds[i].lexeme)),
PointerGetDatum(NULL)));
/* a NULL result means the subdictionary does not know the word */
if (!ptr)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
d->wrds[i].lexeme,
d->wrds[i].entries->idsubst + 1)));
/* an empty result array means the word is a stop word */
else if (!(ptr->lexeme))
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
d->wrds[i].lexeme,
d->wrds[i].entries->idsubst + 1),
errhint("Use \"?\" to represent a stop word within a sample phrase.")));
else
{
/* walk the result one variant (run of equal nvariant) at a time */
while (ptr->lexeme)
{
TSLexeme *remptr = ptr + 1;
int tnvar = 1;
int curvar = ptr->nvariant;
/* compute n words in one variant */
while (remptr->lexeme)
{
if (remptr->nvariant != (remptr - 1)->nvariant)
break;
tnvar++;
remptr++;
}
/* store every lexeme of this variant, tagged with the variant size */
remptr = ptr;
while (remptr->lexeme && remptr->nvariant == curvar)
{
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
remptr++;
}
ptr = remptr;
}
}
}
/* the raw word and its entry have been copied; release them */
pfree(d->wrds[i].lexeme);
pfree(d->wrds[i].entries);
}
if (d->wrds)
pfree(d->wrds);
d->wrds = newwrds;
d->nwrds = nnw;
d->ntwrds = tnm;
if (d->nwrds > 1)
{
qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
/* uniq */
/* newwrds is the last element kept, ptrwrds the element being examined */
newwrds = d->wrds;
ptrwrds = d->wrds + 1;
while (ptrwrds - d->wrds < d->nwrds)
{
if (cmpLexeme(ptrwrds, newwrds) == 0)
{
/*
* Same lexeme: push a differing entry onto the head of the kept
* element's list, or drop an exact duplicate.  Entries of equal
* lexemes were sorted in reverse cmpLexemeInfo order, so pushing
* at the head leaves the list in ascending order.
*/
if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
{
ptrwrds->entries->nextentry = newwrds->entries;
newwrds->entries = ptrwrds->entries;
}
else
pfree(ptrwrds->entries);
if (ptrwrds->lexeme)
pfree(ptrwrds->lexeme);
}
else
{
/* new lexeme: move it down next to the last kept element */
newwrds++;
*newwrds = *ptrwrds;
}
ptrwrds++;
}
/* shrink the array to the number of distinct lexemes */
d->nwrds = newwrds - d->wrds + 1;
d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
}
}
/*
 * Convert the substitute phrase of every rule into its final form.
 *
 * Each word stored by addWrd() is either taken verbatim (DT_USEASIS) or
 * normalized with the subdictionary's lexize function; the resulting
 * lexemes are copied into a new array that replaces d->subst[i].res, and
 * reslen is set to their number.  The first lexeme produced by every word
 * except the first one gets TSL_ADDPOS.  The original words and their array
 * are freed.
 *
 * A substitute word that is unknown to the subdictionary or is a stop word,
 * and a rule whose substitute ends up empty, raise an error with
 * ERRCODE_CONFIG_FILE_ERROR.
 */
static void
compileTheSubstitute(DictThesaurus *d)
{
	int			i;

	for (i = 0; i < d->nsubst; i++)
	{
		TheSubstitute *rule = &d->subst[i];
		TSLexeme   *raw = rule->res;
		TSLexeme   *inptr;
		int			allocated = 2;
		int			nout = 0;

		rule->res = (TSLexeme *) palloc(sizeof(TSLexeme) * allocated);
		rule->res[0].lexeme = NULL;

		for (inptr = raw; inptr && inptr->lexeme; inptr++)
		{
			TSLexeme   *lexized;
			TSLexeme	tmplex[2];
			int			firstout = nout;	/* where this word's output starts */

			if (inptr->flags & DT_USEASIS)
			{
				/* do not lexize: wrap the word in a one-element array */
				tmplex[0] = *inptr;
				tmplex[0].flags = 0;
				tmplex[1].lexeme = NULL;
				lexized = tmplex;
			}
			else
			{
				lexized = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
																	 PointerGetDatum(d->subdict->dictData),
																	 PointerGetDatum(inptr->lexeme),
																	 Int32GetDatum(strlen(inptr->lexeme)),
																	 PointerGetDatum(NULL)));
			}

			if (lexized == NULL)
			{
				ereport(ERROR,
						(errcode(ERRCODE_CONFIG_FILE_ERROR),
						 errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
								inptr->lexeme, i + 1)));
			}
			else if (lexized->lexeme == NULL)
			{
				ereport(ERROR,
						(errcode(ERRCODE_CONFIG_FILE_ERROR),
						 errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
								inptr->lexeme, i + 1)));
			}
			else
			{
				for (; lexized->lexeme; lexized++)
				{
					/* keep one spare slot, doubling the array when needed */
					if (nout + 1 >= allocated)
					{
						allocated *= 2;
						rule->res = (TSLexeme *) repalloc(rule->res, sizeof(TSLexeme) * allocated);
					}

					rule->res[nout] = *lexized;
					rule->res[nout].lexeme = pstrdup(lexized->lexeme);
					nout++;
				}

				/* every word after the first one starts a new position */
				if (firstout > 0)
					rule->res[firstout].flags |= TSL_ADDPOS;
			}

			/* the loop condition guarantees the word text is non-NULL here */
			pfree(inptr->lexeme);
		}

		if (nout == 0)
			ereport(ERROR,
					(errcode(ERRCODE_CONFIG_FILE_ERROR),
					 errmsg("thesaurus substitute phrase is empty (rule %d)",
							i + 1)));

		rule->reslen = nout;
		pfree(raw);
	}
}
/*
 * Dictionary init function.
 *
 * Argument 0 is a List of DefElem options.  "dictfile" names the thesaurus
 * file, which is read immediately; "dictionary" names the subdictionary.
 * Each option must appear exactly once and any other option is rejected.
 * After the options are processed the subdictionary is looked up and both
 * sides of every rule are compiled.  Returns the DictThesaurus pointer.
 */
Datum
thesaurus_init(PG_FUNCTION_ARGS)
{
	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
	DictThesaurus *d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
	char	   *subdictname = NULL;
	bool		fileloaded = false;
	ListCell   *cell;

	foreach(cell, dictoptions)
	{
		DefElem    *opt = (DefElem *) lfirst(cell);

		if (strcmp(opt->defname, "dictfile") == 0)
		{
			if (fileloaded)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("multiple DictFile parameters")));
			thesaurusRead(defGetString(opt), d);
			fileloaded = true;
			continue;
		}

		if (strcmp(opt->defname, "dictionary") == 0)
		{
			if (subdictname)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("multiple Dictionary parameters")));
			subdictname = pstrdup(defGetString(opt));
			continue;
		}

		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("unrecognized Thesaurus parameter: \"%s\"",
						opt->defname)));
	}

	/* both options are mandatory */
	if (!fileloaded)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("missing DictFile parameter")));
	if (!subdictname)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("missing Dictionary parameter")));

	/* resolve the subdictionary, then normalize both sides of every rule */
	d->subdictOid = get_ts_dict_oid(stringToQualifiedNameList(subdictname), false);
	d->subdict = lookup_ts_dictionary_cache(d->subdictOid);

	compileTheLexeme(d);
	compileTheSubstitute(d);

	PG_RETURN_POINTER(d);
}
/*
 * Look up lexeme in d->wrds by binary search (comparing lexeme text only)
 * and return its entry list, or NULL when it is absent or the dictionary
 * has no words.  lexeme may be NULL; cmpLexeme handles NULL text.
 */
static LexemeInfo *
findTheLexeme(DictThesaurus *d, char *lexeme)
{
	TheLexeme	probe;
	TheLexeme  *hit;

	if (d->nwrds == 0)
		return NULL;

	probe.lexeme = lexeme;
	probe.entries = NULL;

	hit = bsearch(&probe, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);

	return hit ? hit->entries : NULL;
}
/*
 * Report whether rule idsubst occurs in the nextvariant-linked chain that
 * starts at stored.  An empty chain (stored == NULL) matches every rule.
 */
static bool
matchIdSubst(LexemeInfo *stored, uint32 idsubst)
{
	LexemeInfo *cur;

	if (stored == NULL)
		return true;

	for (cur = stored; cur != NULL; cur = cur->nextvariant)
	{
		if (cur->idsubst == idsubst)
			return true;
	}

	return false;
}
/*
* Collect the rules that can continue a match at position curpos.
*
* newin is an array of newn entry lists (linked by nextentry), one per
* lexeme of the variant being examined.  A rule qualifies when every list
* contains an entry for it with posinsubst == curpos and tnvariant == newn,
* when matchIdSubst(stored, rule) holds (always true for a NULL stored) and
* when the rule is not already present in the chain "in".  Qualifying
* entries are pushed onto the head of "in" through nextvariant.
*
* The elements of newin are advanced in place.  The function returns the
* updated chain as soon as one of the lists is exhausted.
*
* NOTE(review): the scan only moves forward over idsubst, so it presumably
* relies on each list being ordered by ascending idsubst - confirm against
* compileTheLexeme.
*/
static LexemeInfo *
findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
{
for (;;)
{
int i;
/* ptr is the current candidate; its idsubst is the rule being tried */
LexemeInfo *ptr = newin[0];
for (i = 0; i < newn; i++)
{
/* skip entries of list i that belong to earlier rules */
while (newin[i] && newin[i]->idsubst < ptr->idsubst)
newin[i] = newin[i]->nextentry;
if (newin[i] == NULL)
return in;
/* list i has no entry for this rule: retry all lists with the later rule */
if (newin[i]->idsubst > ptr->idsubst)
{
ptr = newin[i];
i = -1;
continue;
}
/* within this rule, look for an entry at curpos with the right variant size */
while (newin[i]->idsubst == ptr->idsubst)
{
if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
{
ptr = newin[i];
break;
}
newin[i] = newin[i]->nextentry;
if (newin[i] == NULL)
return in;
}
/* ran past the rule without a suitable entry: restart with the next rule */
if (newin[i]->idsubst != ptr->idsubst)
{
ptr = newin[i];
i = -1;
continue;
}
}
/* i == newn means every list supplied a suitable entry for ptr's rule */
if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
{ /* found */
ptr->nextvariant = in;
in = ptr;
}
/* step forward */
for (i = 0; i < newn; i++)
newin[i] = newin[i]->nextentry;
}
}
/*
 * Return a freshly allocated copy of the substitute lexemes in ts.  Every
 * lexeme string is duplicated and the array is terminated by an element
 * whose lexeme is NULL.
 */
static TSLexeme *
copyTSLexeme(TheSubstitute *ts)
{
	uint16		total = ts->reslen;
	uint16		k = 0;
	TSLexeme   *copy;

	copy = (TSLexeme *) palloc(sizeof(TSLexeme) * (total + 1));

	while (k < total)
	{
		copy[k] = ts->res[k];
		copy[k].lexeme = pstrdup(ts->res[k].lexeme);
		k++;
	}

	copy[total].lexeme = NULL;

	return copy;
}
/*
 * Walk the nextvariant chain starting at info and return a copy of the
 * substitute of the first rule whose last sample lexeme is at curpos, or
 * NULL if there is none.
 *
 * *moreres is set to true if any examined candidate has a successor in the
 * chain, and to false otherwise.
 */
static TSLexeme *
checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
{
	LexemeInfo *cand;

	*moreres = false;

	for (cand = info; cand != NULL; cand = cand->nextvariant)
	{
		Assert(cand->idsubst < d->nsubst);

		if (cand->nextvariant)
			*moreres = true;

		if (d->subst[cand->idsubst].lastlexeme == curpos)
			return copyTSLexeme(&d->subst[cand->idsubst]);
	}

	return NULL;
}
/*
 * Dictionary lexize function.
 *
 * Arguments: 0 - the DictThesaurus built by thesaurus_init, 1 and 2 - the
 * word and its length (passed through unchanged to the subdictionary),
 * 3 - a DictSubState carrying the match state between calls.
 *
 * The candidate chain of the previous call is kept in
 * dstate->private_state.  The word is normalized by the subdictionary and
 * the candidates that still match at the next position are collected with
 * findVariant().  Results:
 *   - no candidate left: returns NULL with getnext = false;
 *   - a rule is complete: returns a copy of its substitute, with getnext
 *     set when other candidates remain;
 *   - otherwise: returns NULL with getnext = true to ask for the next word.
 *
 * The function must be called with exactly four arguments and a non-NULL
 * state, otherwise an error is raised.
 */
Datum
thesaurus_lexize(PG_FUNCTION_ARGS)
{
	DictThesaurus *d;
	DictSubState *dstate;
	TSLexeme   *res = NULL;
	LexemeInfo *stored,
			   *info = NULL;
	uint16		curpos = 0;
	bool		moreres = false;

	/*
	 * Verify the argument count before fetching any argument, so that a
	 * call with too few arguments cannot read past the argument array.
	 */
	if (PG_NARGS() != 4)
		elog(ERROR, "forbidden call of thesaurus or nested call");

	d = (DictThesaurus *) PG_GETARG_POINTER(0);
	dstate = (DictSubState *) PG_GETARG_POINTER(3);

	if (dstate == NULL)
		elog(ERROR, "forbidden call of thesaurus or nested call");

	if (dstate->isend)
		PG_RETURN_POINTER(NULL);

	/* continue after the position reached by the previous call, if any */
	stored = (LexemeInfo *) dstate->private_state;
	if (stored)
		curpos = stored->posinsubst + 1;

	/* refresh the cached subdictionary entry if it has been invalidated */
	if (!d->subdict->isvalid)
		d->subdict = lookup_ts_dictionary_cache(d->subdictOid);

	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
													 PointerGetDatum(d->subdict->dictData),
													 PG_GETARG_DATUM(1),
													 PG_GETARG_DATUM(2),
													 PointerGetDatum(NULL)));

	if (res && res->lexeme)
	{
		TSLexeme   *ptr = res,
				   *basevar;

		/* process the subdictionary output one variant at a time */
		while (ptr->lexeme)
		{
			uint16		nv = ptr->nvariant;
			uint16		i,
						nlex = 0;
			LexemeInfo **infos;

			/* count the lexemes of this variant; ptr ends up past them */
			basevar = ptr;
			while (ptr->lexeme && nv == ptr->nvariant)
			{
				nlex++;
				ptr++;
			}

			/* every lexeme of the variant must be known to the thesaurus */
			infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
			for (i = 0; i < nlex; i++)
				if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
					break;

			if (i < nlex)
			{
				/* no chance to find */
				pfree(infos);
				continue;
			}

			info = findVariant(info, stored, curpos, infos, nlex);
		}
	}
	else if (res)
	{							/* stop-word */
		LexemeInfo *infos = findTheLexeme(d, NULL);

		info = findVariant(NULL, stored, curpos, &infos, 1);
	}
	else
	{
		info = NULL;			/* word isn't recognized */
	}

	/* remember the surviving candidates for the next call */
	dstate->private_state = (void *) info;

	if (!info)
	{
		dstate->getnext = false;
		PG_RETURN_POINTER(NULL);
	}

	if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
	{
		dstate->getnext = moreres;
		PG_RETURN_POINTER(res);
	}

	/* nothing complete yet: ask for the next word */
	dstate->getnext = true;

	PG_RETURN_POINTER(NULL);
}