mirror of
https://github.com/postgres/postgres.git
synced 2025-07-05 07:21:24 +03:00
We used to strategically place newlines after some function call left parentheses to make pgindent move the argument list a few chars to the left, so that the whole line would fit under 80 chars. However, pgindent no longer does that, so the newlines just made the code vertically longer for no reason. Remove those newlines, and reflow some of those lines for some extra naturality. Reviewed-by: Michael Paquier, Tom Lane Discussion: https://postgr.es/m/20200129200401.GA6303@alvherre.pgsql
668 lines
14 KiB
C
668 lines
14 KiB
C
/*-------------------------------------------------------------------------
 *
 * ts_parse.c
 *		main parse functions for tsearch
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/tsearch/ts_parse.c
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "tsearch/ts_cache.h"
|
|
#include "tsearch/ts_utils.h"
|
|
|
|
#define IGNORE_LONGLEXEME 1
|
|
|
|
/*
 * Lexize subsystem
 */

/*
 * One raw token emitted by the parser, held in a singly-linked list
 * until the dictionaries have processed it.
 */
typedef struct ParsedLex
{
	int			type;			/* parser-assigned token type (0 = end) */
	char	   *lemm;			/* token text; NOTE: not null-terminated */
	int			lenlemm;		/* length of lemm in bytes */
	struct ParsedLex *next;		/* next token in list, or NULL */
} ParsedLex;
|
|
|
|
/* Head/tail pointers of a singly-linked list of ParsedLex nodes */
typedef struct ListParsedLex
{
	ParsedLex  *head;
	ParsedLex  *tail;
} ListParsedLex;
|
|
|
|
/*
 * Working state for lexizing a stream of parser tokens against the
 * dictionaries of a text search configuration.
 */
typedef struct
{
	TSConfigCacheEntry *cfg;	/* configuration being applied */
	Oid			curDictId;		/* dictionary currently asking for more
								 * words, or InvalidOid in one-word mode */
	int			posDict;		/* index of next dictionary to try for the
								 * current token */
	DictSubState dictState;		/* state shared with the dictionary's lexize
								 * function across calls */
	ParsedLex  *curSub;			/* next token to feed in multiword mode */
	ListParsedLex towork;		/* current list to work */
	ListParsedLex waste;		/* list of lexemes that already lexized */

	/*
	 * fields to store last variant to lexize (basically, thesaurus or similar
	 * to, which wants several lexemes
	 */

	ParsedLex  *lastRes;		/* token at which tmpRes was produced */
	TSLexeme   *tmpRes;			/* saved partial result while the dictionary
								 * keeps requesting further words */
} LexizeData;
|
|
|
|
static void
|
|
LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
|
|
{
|
|
ld->cfg = cfg;
|
|
ld->curDictId = InvalidOid;
|
|
ld->posDict = 0;
|
|
ld->towork.head = ld->towork.tail = ld->curSub = NULL;
|
|
ld->waste.head = ld->waste.tail = NULL;
|
|
ld->lastRes = NULL;
|
|
ld->tmpRes = NULL;
|
|
}
|
|
|
|
static void
|
|
LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
|
|
{
|
|
if (list->tail)
|
|
{
|
|
list->tail->next = newpl;
|
|
list->tail = newpl;
|
|
}
|
|
else
|
|
list->head = list->tail = newpl;
|
|
newpl->next = NULL;
|
|
}
|
|
|
|
static ParsedLex *
|
|
LPLRemoveHead(ListParsedLex *list)
|
|
{
|
|
ParsedLex *res = list->head;
|
|
|
|
if (list->head)
|
|
list->head = list->head->next;
|
|
|
|
if (list->head == NULL)
|
|
list->tail = NULL;
|
|
|
|
return res;
|
|
}
|
|
|
|
static void
|
|
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
|
|
{
|
|
ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
|
|
|
|
newpl->type = type;
|
|
newpl->lemm = lemm;
|
|
newpl->lenlemm = lenlemm;
|
|
LPLAddTail(&ld->towork, newpl);
|
|
ld->curSub = ld->towork.tail;
|
|
}
|
|
|
|
static void
|
|
RemoveHead(LexizeData *ld)
|
|
{
|
|
LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
|
|
|
|
ld->posDict = 0;
|
|
}
|
|
|
|
static void
|
|
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
|
|
{
|
|
if (correspondLexem)
|
|
{
|
|
*correspondLexem = ld->waste.head;
|
|
}
|
|
else
|
|
{
|
|
ParsedLex *tmp,
|
|
*ptr = ld->waste.head;
|
|
|
|
while (ptr)
|
|
{
|
|
tmp = ptr->next;
|
|
pfree(ptr);
|
|
ptr = tmp;
|
|
}
|
|
}
|
|
ld->waste.head = ld->waste.tail = NULL;
|
|
}
|
|
|
|
static void
|
|
moveToWaste(LexizeData *ld, ParsedLex *stop)
|
|
{
|
|
bool go = true;
|
|
|
|
while (ld->towork.head && go)
|
|
{
|
|
if (ld->towork.head == stop)
|
|
{
|
|
ld->curSub = stop->next;
|
|
go = false;
|
|
}
|
|
RemoveHead(ld);
|
|
}
|
|
}
|
|
|
|
static void
|
|
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
|
|
{
|
|
if (ld->tmpRes)
|
|
{
|
|
TSLexeme *ptr;
|
|
|
|
for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
|
|
pfree(ptr->lexeme);
|
|
pfree(ld->tmpRes);
|
|
}
|
|
ld->tmpRes = res;
|
|
ld->lastRes = lex;
|
|
}
|
|
|
|
/*
 * LexizeExec - run the queued tokens through the configuration's
 * dictionaries and return the next array of normalized lexemes, or NULL
 * when nothing more can be produced from the tokens queued so far.
 *
 * Operates in two modes.  In the usual mode (curDictId == InvalidOid) each
 * head token is offered to its type's dictionaries in map order until one
 * accepts it.  If a dictionary sets dictState.getnext, we switch to
 * multiword mode (curDictId valid) and keep feeding following tokens to
 * that same dictionary until it yields a result or gives up, then recurse
 * back into the usual mode.
 *
 * If correspondLexem is non-NULL, the consumed raw tokens are returned
 * through it (caller frees); otherwise they are freed here (see
 * setCorrLex).
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
	int			i;
	ListDictionary *map;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (ld->curDictId == InvalidOid)
	{
		/*
		 * usual mode: dictionary wants only one word, but we should keep in
		 * mind that we should go through all stack
		 */

		while (ld->towork.head)
		{
			ParsedLex  *curVal = ld->towork.head;
			char	   *curValLemm = curVal->lemm;
			int			curValLenLemm = curVal->lenlemm;

			map = ld->cfg->map + curVal->type;

			if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
			{
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			/* resume at posDict so already-tried dictionaries are skipped */
			for (i = ld->posDict; i < map->len; i++)
			{
				dict = lookup_ts_dictionary_cache(map->dictIds[i]);

				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private_state = NULL;
				res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
																 PointerGetDatum(dict->dictData),
																 PointerGetDatum(curValLemm),
																 Int32GetDatum(curValLenLemm),
																 PointerGetDatum(&ld->dictState)));

				if (ld->dictState.getnext)
				{
					/*
					 * dictionary wants next word, so setup and store current
					 * position and go to multiword mode
					 */

					ld->curDictId = DatumGetObjectId(map->dictIds[i]);
					ld->posDict = i + 1;
					ld->curSub = curVal->next;
					if (res)
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res)		/* dictionary doesn't know this lexeme */
					continue;

				if (res->flags & TSL_FILTER)
				{
					/* filtering dictionary: replace the input and go on */
					curValLemm = res->lexeme;
					curValLenLemm = strlen(res->lexeme);
					continue;
				}

				/* dictionary recognized the lexeme: emit its result */
				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* no dictionary accepted this token; discard it */
			RemoveHead(ld);
		}
	}
	else
	{							/* curDictId is valid */
		dict = lookup_ts_dictionary_cache(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */

		while (ld->curSub)
		{
			ParsedLex  *curVal = ld->curSub;

			map = ld->cfg->map + curVal->type;

			if (curVal->type != 0)
			{
				bool		dictExists = false;

				if (curVal->type >= ld->cfg->lenmap || map->len == 0)
				{
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/*
				 * We should be sure that current type of lexeme is recognized
				 * by our dictionary: we just check is it exist in list of
				 * dictionaries ?
				 */
				for (i = 0; i < map->len && !dictExists; i++)
					if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
						dictExists = true;

				if (!dictExists)
				{
					/*
					 * Dictionary can't work with current type of lexeme,
					 * return to basic mode and redo all stored lexemes
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			/* type == 0 marks end of input; tell the dictionary so */
			ld->dictState.isend = (curVal->type == 0) ? true : false;
			ld->dictState.getnext = false;

			res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
															 PointerGetDatum(dict->dictData),
															 PointerGetDatum(curVal->lemm),
															 Int32GetDatum(curVal->lenlemm),
															 PointerGetDatum(&ld->dictState)));

			if (ld->dictState.getnext)
			{
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if (res)
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if (res || ld->tmpRes)
			{
				/*
				 * Dictionary normalizes lexemes, so we remove from stack all
				 * used lexemes, return to basic mode and redo end of stack
				 * (if it exists)
				 */
				if (res)
				{
					moveToWaste(ld, ld->curSub);
				}
				else
				{
					/* fall back to the partial result saved earlier */
					res = ld->tmpRes;
					moveToWaste(ld, ld->lastRes);
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/*
			 * Dict don't want next lexem and didn't recognize anything, redo
			 * from ld->towork.head
			 */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	setCorrLex(ld, correspondLexem);
	return NULL;
}
|
|
|
|
/*
 * Parse string and lexize words.
 *
 * prs will be filled in: one ParsedWord per normalized lexeme, with
 * positions assigned sequentially.  prs->words must already be allocated
 * with prs->lenwords slots; it is grown here as needed.
 */
void
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
{
	int			type,
				lenlemm;
	char	   *lemm = NULL;
	LexizeData	ldata;
	TSLexeme   *norms;
	TSConfigCacheEntry *cfg;
	TSParserCacheEntry *prsobj;
	void	   *prsdata;

	cfg = lookup_ts_config_cache(cfgId);
	prsobj = lookup_ts_parser_cache(cfg->prsId);

	/* let the parser set up its private state over the input buffer */
	prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
													 PointerGetDatum(buf),
													 Int32GetDatum(buflen)));

	LexizeInit(&ldata, cfg);

	do
	{
		/* fetch the next raw token; type <= 0 means end of input */
		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
										   PointerGetDatum(prsdata),
										   PointerGetDatum(&lemm),
										   PointerGetDatum(&lenlemm)));

		if (type > 0 && lenlemm >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			/* over-long words are reported and skipped, not fatal */
			ereport(NOTICE,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
#endif
		}

		LexizeAddLemm(&ldata, type, lemm, lenlemm);

		/* drain every lexeme array the dictionaries can produce so far */
		while ((norms = LexizeExec(&ldata, NULL)) != NULL)
		{
			TSLexeme   *ptr = norms;

			prs->pos++;			/* set pos */

			while (ptr->lexeme)
			{
				if (prs->curwords == prs->lenwords)
				{
					/* grow the output array by doubling */
					prs->lenwords *= 2;
					prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
				}

				if (ptr->flags & TSL_ADDPOS)
					prs->pos++;
				prs->words[prs->curwords].len = strlen(ptr->lexeme);
				prs->words[prs->curwords].word = ptr->lexeme;
				prs->words[prs->curwords].nvariant = ptr->nvariant;
				prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
				prs->words[prs->curwords].alen = 0;
				prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
				ptr++;
				prs->curwords++;
			}
			pfree(norms);
		}
	} while (type > 0);

	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
|
|
|
|
/*
|
|
* Headline framework
|
|
*/
|
|
static void
|
|
hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
|
|
{
|
|
while (prs->curwords >= prs->lenwords)
|
|
{
|
|
prs->lenwords *= 2;
|
|
prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
|
|
}
|
|
memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
|
|
prs->words[prs->curwords].type = (uint8) type;
|
|
prs->words[prs->curwords].len = buflen;
|
|
prs->words[prs->curwords].word = palloc(buflen);
|
|
memcpy(prs->words[prs->curwords].word, buf, buflen);
|
|
prs->curwords++;
|
|
}
|
|
|
|
static void
|
|
hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
|
|
{
|
|
int i;
|
|
QueryItem *item = GETQUERY(query);
|
|
HeadlineWordEntry *word;
|
|
|
|
while (prs->curwords + query->size >= prs->lenwords)
|
|
{
|
|
prs->lenwords *= 2;
|
|
prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
|
|
}
|
|
|
|
word = &(prs->words[prs->curwords - 1]);
|
|
word->pos = LIMITPOS(pos);
|
|
for (i = 0; i < query->size; i++)
|
|
{
|
|
if (item->type == QI_VAL &&
|
|
tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
|
|
buf, buflen, item->qoperand.prefix) == 0)
|
|
{
|
|
if (word->item)
|
|
{
|
|
memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
|
|
prs->words[prs->curwords].item = &item->qoperand;
|
|
prs->words[prs->curwords].repeated = 1;
|
|
prs->curwords++;
|
|
}
|
|
else
|
|
word->item = &item->qoperand;
|
|
}
|
|
item++;
|
|
}
|
|
}
|
|
|
|
static void
|
|
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
|
|
{
|
|
ParsedLex *tmplexs;
|
|
TSLexeme *ptr;
|
|
int32 savedpos;
|
|
|
|
while (lexs)
|
|
{
|
|
if (lexs->type > 0)
|
|
hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
|
|
|
|
ptr = norms;
|
|
savedpos = prs->vectorpos;
|
|
while (ptr && ptr->lexeme)
|
|
{
|
|
if (ptr->flags & TSL_ADDPOS)
|
|
savedpos++;
|
|
hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
|
|
ptr++;
|
|
}
|
|
|
|
tmplexs = lexs->next;
|
|
pfree(lexs);
|
|
lexs = tmplexs;
|
|
}
|
|
|
|
if (norms)
|
|
{
|
|
ptr = norms;
|
|
while (ptr->lexeme)
|
|
{
|
|
if (ptr->flags & TSL_ADDPOS)
|
|
prs->vectorpos++;
|
|
pfree(ptr->lexeme);
|
|
ptr++;
|
|
}
|
|
pfree(norms);
|
|
}
|
|
}
|
|
|
|
/*
 * Parse and lexize text for headline generation.
 *
 * Like parsetext(), but keeps the raw tokens (via LexizeExec's
 * correspondLexem output) so the original word forms can appear in the
 * headline, and marks which words match the given query.
 */
void
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
	int			type,
				lenlemm;
	char	   *lemm = NULL;
	LexizeData	ldata;
	TSLexeme   *norms;
	ParsedLex  *lexs;
	TSConfigCacheEntry *cfg;
	TSParserCacheEntry *prsobj;
	void	   *prsdata;

	cfg = lookup_ts_config_cache(cfgId);
	prsobj = lookup_ts_parser_cache(cfg->prsId);

	/* initialize the parser over the input buffer */
	prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
													 PointerGetDatum(buf),
													 Int32GetDatum(buflen)));

	LexizeInit(&ldata, cfg);

	do
	{
		/* fetch the next raw token; type <= 0 means end of input */
		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
										   PointerGetDatum(prsdata),
										   PointerGetDatum(&lemm),
										   PointerGetDatum(&lenlemm)));

		if (type > 0 && lenlemm >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			/* over-long words are reported and skipped, not fatal */
			ereport(NOTICE,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
#endif
		}

		LexizeAddLemm(&ldata, type, lemm, lenlemm);

		/* flush raw tokens and lexemes even when lexizing yields nothing */
		do
		{
			if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
			{
				prs->vectorpos++;
				addHLParsedLex(prs, query, lexs, norms);
			}
			else
				addHLParsedLex(prs, query, lexs, NULL);
		} while (norms);

	} while (type > 0);

	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
|
|
|
|
/*
 * Assemble the final headline text from the marked-up word array.
 *
 * Words flagged "in" are emitted, wrapped in startsel/stopsel when
 * selected; fragments are separated by fragdelim.  Words not in the
 * headline have their copied text freed here.  Returns a palloc'd text.
 */
text *
generateHeadline(HeadlineParsedText *prs)
{
	text	   *out;
	char	   *ptr;
	int			len = 128;		/* current allocation, doubled as needed */
	int			numfragments = 0;
	int16		infrag = 0;		/* currently inside a fragment? */

	HeadlineWordEntry *wrd = prs->words;

	out = (text *) palloc(len);
	ptr = ((char *) out) + VARHDRSZ;

	while (wrd - prs->words < prs->curwords)
	{
		/* grow the buffer until the worst-case addition fits */
		while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
		{
			int			dist = ptr - ((char *) out);

			len *= 2;
			out = (text *) repalloc(out, len);
			ptr = ((char *) out) + dist;
		}

		if (wrd->in && !wrd->repeated)
		{
			if (!infrag)
			{

				/* start of a new fragment */
				infrag = 1;
				numfragments++;
				/* add a fragment delimiter if this is after the first one */
				if (numfragments > 1)
				{
					memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
					ptr += prs->fragdelimlen;
				}

			}
			if (wrd->replace)
			{
				/* replaced word collapses to a single space */
				*ptr = ' ';
				ptr++;
			}
			else if (!wrd->skip)
			{
				if (wrd->selected)
				{
					memcpy(ptr, prs->startsel, prs->startsellen);
					ptr += prs->startsellen;
				}
				memcpy(ptr, wrd->word, wrd->len);
				ptr += wrd->len;
				if (wrd->selected)
				{
					memcpy(ptr, prs->stopsel, prs->stopsellen);
					ptr += prs->stopsellen;
				}
			}
		}
		else if (!wrd->repeated)
		{
			/* word not used in headline: close fragment, free its copy */
			if (infrag)
				infrag = 0;
			pfree(wrd->word);
		}

		wrd++;
	}

	SET_VARSIZE(out, ptr - ((char *) out));
	return out;
}
|