postgres/src/backend/tsearch/ts_parse.c
Latest commit c9d2977519 by Alvaro Herrera: Clean up newlines following left parentheses
We used to strategically place newlines after some function call left
parentheses to make pgindent move the argument list a few chars to the
left, so that the whole line would fit under 80 chars.  However,
pgindent no longer does that, so the newlines just made the code
vertically longer for no reason.  Remove those newlines, and reflow some
of those lines for some extra naturality.

Reviewed-by: Michael Paquier, Tom Lane
Discussion: https://postgr.es/m/20200129200401.GA6303@alvherre.pgsql
2020-01-30 13:42:14 -03:00
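
A hypothetical before/after sketch of the style this commit removes (the
function and argument names below are made up for illustration; they are not
taken from the actual diff):

/* old style: a newline right after the left parenthesis, so that pgindent
* would shift the argument list a few characters to the left and keep the
* whole line under 80 characters */
res = SomeLongFunctionName(
	first_argument, second_argument, third_argument);

/* style kept after this commit: the newline is dropped and the call reflowed */
res = SomeLongFunctionName(first_argument, second_argument, third_argument);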


/*-------------------------------------------------------------------------
*
* ts_parse.c
* main parse functions for tsearch
*
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/tsearch/ts_parse.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_utils.h"
#define IGNORE_LONGLEXEME 1
/*
* Lexize subsystem
*/
typedef struct ParsedLex
{
int type;
char *lemm;
int lenlemm;
struct ParsedLex *next;
} ParsedLex;
typedef struct ListParsedLex
{
ParsedLex *head;
ParsedLex *tail;
} ListParsedLex;
typedef struct
{
TSConfigCacheEntry *cfg;
Oid curDictId;
int posDict;
DictSubState dictState;
ParsedLex *curSub;
ListParsedLex towork; /* current list to work on */
ListParsedLex waste; /* list of lexemes that have already been lexized */
/*
* fields to store the last variant to lexize (used by thesaurus-like
* dictionaries, which want several lexemes)
*/
ParsedLex *lastRes;
TSLexeme *tmpRes;
} LexizeData;
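/* Initialize a LexizeData for the given text search configuration */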
static void
LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
{
ld->cfg = cfg;
ld->curDictId = InvalidOid;
ld->posDict = 0;
ld->towork.head = ld->towork.tail = ld->curSub = NULL;
ld->waste.head = ld->waste.tail = NULL;
ld->lastRes = NULL;
ld->tmpRes = NULL;
}
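/* Append a ParsedLex to the tail of a list */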
static void
LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
{
if (list->tail)
{
list->tail->next = newpl;
list->tail = newpl;
}
else
list->head = list->tail = newpl;
newpl->next = NULL;
}
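/* Detach and return the head of a list, or NULL if the list is empty */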
static ParsedLex *
LPLRemoveHead(ListParsedLex *list)
{
ParsedLex *res = list->head;
if (list->head)
list->head = list->head->next;
if (list->head == NULL)
list->tail = NULL;
return res;
}
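/* Add a token reported by the parser to the list of lexemes to process */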
static void
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
{
ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
newpl->type = type;
newpl->lemm = lemm;
newpl->lenlemm = lenlemm;
LPLAddTail(&ld->towork, newpl);
ld->curSub = ld->towork.tail;
}
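/* Move the head of the work list to the waste list and reset the dictionary position */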
static void
RemoveHead(LexizeData *ld)
{
LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
ld->posDict = 0;
}
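/* Return the processed (waste) lexemes to the caller, or free them */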
static void
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
{
if (correspondLexem)
{
*correspondLexem = ld->waste.head;
}
else
{
ParsedLex *tmp,
*ptr = ld->waste.head;
while (ptr)
{
tmp = ptr->next;
pfree(ptr);
ptr = tmp;
}
}
ld->waste.head = ld->waste.tail = NULL;
}
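/* Move work-list entries to the waste list, up to and including "stop" */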
static void
moveToWaste(LexizeData *ld, ParsedLex *stop)
{
bool go = true;
while (ld->towork.head && go)
{
if (ld->towork.head == stop)
{
ld->curSub = stop->next;
go = false;
}
RemoveHead(ld);
}
}
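/* Remember the latest partial result produced by a multi-word dictionary */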
static void
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
{
if (ld->tmpRes)
{
TSLexeme *ptr;
for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
pfree(ptr->lexeme);
pfree(ld->tmpRes);
}
ld->tmpRes = res;
ld->lastRes = lex;
}
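/*
* Run the configuration's dictionaries over the pending lexemes and return
* the next array of normalized lexemes, or NULL when nothing more can be
* produced from the input accumulated so far. If correspondLexem is not
* NULL, the consumed source lexemes are handed back through it instead of
* being freed.
*/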
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
int i;
ListDictionary *map;
TSDictionaryCacheEntry *dict;
TSLexeme *res;
if (ld->curDictId == InvalidOid)
{
/*
* usual mode: the dictionary wants only one word, but keep in mind
* that we must still go through the whole stack
*/
while (ld->towork.head)
{
ParsedLex *curVal = ld->towork.head;
char *curValLemm = curVal->lemm;
int curValLenLemm = curVal->lenlemm;
map = ld->cfg->map + curVal->type;
if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
{
/* skip this type of lexeme */
RemoveHead(ld);
continue;
}
for (i = ld->posDict; i < map->len; i++)
{
dict = lookup_ts_dictionary_cache(map->dictIds[i]);
ld->dictState.isend = ld->dictState.getnext = false;
ld->dictState.private_state = NULL;
res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
PointerGetDatum(dict->dictData),
PointerGetDatum(curValLemm),
Int32GetDatum(curValLenLemm),
PointerGetDatum(&ld->dictState)));
if (ld->dictState.getnext)
{
/*
* the dictionary wants the next word, so set up and store the
* current position and go to multiword mode
*/
ld->curDictId = DatumGetObjectId(map->dictIds[i]);
ld->posDict = i + 1;
ld->curSub = curVal->next;
if (res)
setNewTmpRes(ld, curVal, res);
return LexizeExec(ld, correspondLexem);
}
if (!res) /* dictionary doesn't know this lexeme */
continue;
if (res->flags & TSL_FILTER)
{
curValLemm = res->lexeme;
curValLenLemm = strlen(res->lexeme);
continue;
}
RemoveHead(ld);
setCorrLex(ld, correspondLexem);
return res;
}
RemoveHead(ld);
}
}
else
{ /* curDictId is valid */
dict = lookup_ts_dictionary_cache(ld->curDictId);
/*
* Dictionary ld->curDictId asks us about the following words
*/
while (ld->curSub)
{
ParsedLex *curVal = ld->curSub;
map = ld->cfg->map + curVal->type;
if (curVal->type != 0)
{
bool dictExists = false;
if (curVal->type >= ld->cfg->lenmap || map->len == 0)
{
/* skip this type of lexeme */
ld->curSub = curVal->next;
continue;
}
/*
* We must be sure that the current type of lexeme is recognized
* by our dictionary: we just check whether it exists in the
* dictionary list.
*/
for (i = 0; i < map->len && !dictExists; i++)
if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
dictExists = true;
if (!dictExists)
{
/*
* The dictionary can't work with the current type of lexeme;
* return to basic mode and redo all stored lexemes
*/
ld->curDictId = InvalidOid;
return LexizeExec(ld, correspondLexem);
}
}
ld->dictState.isend = (curVal->type == 0) ? true : false;
ld->dictState.getnext = false;
res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
PointerGetDatum(dict->dictData),
PointerGetDatum(curVal->lemm),
Int32GetDatum(curVal->lenlemm),
PointerGetDatum(&ld->dictState)));
if (ld->dictState.getnext)
{
/* Dictionary wants one more */
ld->curSub = curVal->next;
if (res)
setNewTmpRes(ld, curVal, res);
continue;
}
if (res || ld->tmpRes)
{
/*
* The dictionary normalized the lexemes, so remove all used
* lexemes from the stack, return to basic mode and redo the
* rest of the stack (if any)
*/
if (res)
{
moveToWaste(ld, ld->curSub);
}
else
{
res = ld->tmpRes;
moveToWaste(ld, ld->lastRes);
}
/* reset to initial state */
ld->curDictId = InvalidOid;
ld->posDict = 0;
ld->lastRes = NULL;
ld->tmpRes = NULL;
setCorrLex(ld, correspondLexem);
return res;
}
/*
* The dictionary doesn't want the next lexeme and didn't recognize
* anything; redo from ld->towork.head
*/
ld->curDictId = InvalidOid;
return LexizeExec(ld, correspondLexem);
}
}
setCorrLex(ld, correspondLexem);
return NULL;
}
/*
* Parse string and lexize words.
*
* prs will be filled in.
*/
void
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
{
int type,
lenlemm;
char *lemm = NULL;
LexizeData ldata;
TSLexeme *norms;
TSConfigCacheEntry *cfg;
TSParserCacheEntry *prsobj;
void *prsdata;
cfg = lookup_ts_config_cache(cfgId);
prsobj = lookup_ts_parser_cache(cfg->prsId);
prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
PointerGetDatum(buf),
Int32GetDatum(buflen)));
LexizeInit(&ldata, cfg);
do
{
type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
PointerGetDatum(prsdata),
PointerGetDatum(&lemm),
PointerGetDatum(&lenlemm)));
if (type > 0 && lenlemm >= MAXSTRLEN)
{
#ifdef IGNORE_LONGLEXEME
ereport(NOTICE,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("word is too long to be indexed"),
errdetail("Words longer than %d characters are ignored.",
MAXSTRLEN)));
continue;
#else
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("word is too long to be indexed"),
errdetail("Words longer than %d characters are ignored.",
MAXSTRLEN)));
#endif
}
LexizeAddLemm(&ldata, type, lemm, lenlemm);
while ((norms = LexizeExec(&ldata, NULL)) != NULL)
{
TSLexeme *ptr = norms;
prs->pos++; /* set pos */
while (ptr->lexeme)
{
if (prs->curwords == prs->lenwords)
{
prs->lenwords *= 2;
prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
}
if (ptr->flags & TSL_ADDPOS)
prs->pos++;
prs->words[prs->curwords].len = strlen(ptr->lexeme);
prs->words[prs->curwords].word = ptr->lexeme;
prs->words[prs->curwords].nvariant = ptr->nvariant;
prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
prs->words[prs->curwords].alen = 0;
prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
ptr++;
prs->curwords++;
}
pfree(norms);
}
} while (type > 0);
FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
/*
* Headline framework
*/
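/* Append one parsed token to the headline word array, enlarging it as needed */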
static void
hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
{
while (prs->curwords >= prs->lenwords)
{
prs->lenwords *= 2;
prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
}
memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
prs->words[prs->curwords].type = (uint8) type;
prs->words[prs->curwords].len = buflen;
prs->words[prs->curwords].word = palloc(buflen);
memcpy(prs->words[prs->curwords].word, buf, buflen);
prs->curwords++;
}
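/*
* Set the position of the word just added and, for every matching item of
* the query, mark the word (duplicating it as "repeated" when it matches
* more than one item).
*/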
static void
hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
{
int i;
QueryItem *item = GETQUERY(query);
HeadlineWordEntry *word;
while (prs->curwords + query->size >= prs->lenwords)
{
prs->lenwords *= 2;
prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
}
word = &(prs->words[prs->curwords - 1]);
word->pos = LIMITPOS(pos);
for (i = 0; i < query->size; i++)
{
if (item->type == QI_VAL &&
tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
buf, buflen, item->qoperand.prefix) == 0)
{
if (word->item)
{
memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
prs->words[prs->curwords].item = &item->qoperand;
prs->words[prs->curwords].repeated = 1;
prs->curwords++;
}
else
word->item = &item->qoperand;
}
item++;
}
}
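/*
* Add the source lexemes to the headline word array and mark query matches
* using their normalized forms; both input lists are freed.
*/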
static void
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
{
ParsedLex *tmplexs;
TSLexeme *ptr;
int32 savedpos;
while (lexs)
{
if (lexs->type > 0)
hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
ptr = norms;
savedpos = prs->vectorpos;
while (ptr && ptr->lexeme)
{
if (ptr->flags & TSL_ADDPOS)
savedpos++;
hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
ptr++;
}
tmplexs = lexs->next;
pfree(lexs);
lexs = tmplexs;
}
if (norms)
{
ptr = norms;
while (ptr->lexeme)
{
if (ptr->flags & TSL_ADDPOS)
prs->vectorpos++;
pfree(ptr->lexeme);
ptr++;
}
pfree(norms);
}
}
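/*
* Parse and lexize the string for headline generation. Works like parsetext,
* but keeps the raw tokens in prs so the text can be reassembled later.
*/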
void
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
int type,
lenlemm;
char *lemm = NULL;
LexizeData ldata;
TSLexeme *norms;
ParsedLex *lexs;
TSConfigCacheEntry *cfg;
TSParserCacheEntry *prsobj;
void *prsdata;
cfg = lookup_ts_config_cache(cfgId);
prsobj = lookup_ts_parser_cache(cfg->prsId);
prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
PointerGetDatum(buf),
Int32GetDatum(buflen)));
LexizeInit(&ldata, cfg);
do
{
type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
PointerGetDatum(prsdata),
PointerGetDatum(&lemm),
PointerGetDatum(&lenlemm)));
if (type > 0 && lenlemm >= MAXSTRLEN)
{
#ifdef IGNORE_LONGLEXEME
ereport(NOTICE,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("word is too long to be indexed"),
errdetail("Words longer than %d characters are ignored.",
MAXSTRLEN)));
continue;
#else
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("word is too long to be indexed"),
errdetail("Words longer than %d characters are ignored.",
MAXSTRLEN)));
#endif
}
LexizeAddLemm(&ldata, type, lemm, lenlemm);
do
{
if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
{
prs->vectorpos++;
addHLParsedLex(prs, query, lexs, norms);
}
else
addHLParsedLex(prs, query, lexs, NULL);
} while (norms);
} while (type > 0);
FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
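/*
* Build the headline text from prs->words, wrapping selected words with
* startsel/stopsel and separating fragments with fragdelim.
*/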
text *
generateHeadline(HeadlineParsedText *prs)
{
text *out;
char *ptr;
int len = 128;
int numfragments = 0;
int16 infrag = 0;
HeadlineWordEntry *wrd = prs->words;
out = (text *) palloc(len);
ptr = ((char *) out) + VARHDRSZ;
while (wrd - prs->words < prs->curwords)
{
while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
{
int dist = ptr - ((char *) out);
len *= 2;
out = (text *) repalloc(out, len);
ptr = ((char *) out) + dist;
}
if (wrd->in && !wrd->repeated)
{
if (!infrag)
{
/* start of a new fragment */
infrag = 1;
numfragments++;
/* add a fragment delimiter if this is after the first one */
if (numfragments > 1)
{
memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
ptr += prs->fragdelimlen;
}
}
if (wrd->replace)
{
*ptr = ' ';
ptr++;
}
else if (!wrd->skip)
{
if (wrd->selected)
{
memcpy(ptr, prs->startsel, prs->startsellen);
ptr += prs->startsellen;
}
memcpy(ptr, wrd->word, wrd->len);
ptr += wrd->len;
if (wrd->selected)
{
memcpy(ptr, prs->stopsel, prs->stopsellen);
ptr += prs->stopsellen;
}
}
}
else if (!wrd->repeated)
{
if (infrag)
infrag = 0;
pfree(wrd->word);
}
wrd++;
}
SET_VARSIZE(out, ptr - ((char *) out));
return out;
}