mirror of
https://github.com/postgres/postgres.git
synced 2025-07-30 11:03:19 +03:00
Phrase full text search.
This patch introduces a new text search operator (<-> or <DISTANCE>) into tsquery. The on-disk and binary in/out formats of tsquery remain backward compatible. It has two side effects: - the sort order of tsquery changes, so users who have a btree index over tsquery should reindex it - tsquery output contains fewer parentheses, which makes it more readable Authors: Teodor Sigaev, Oleg Bartunov, Dmitry Ivanov Reviewers: Alexander Korotkov, Artur Zakirov
This commit is contained in:
@ -18,6 +18,13 @@
|
||||
#include "utils/builtins.h"
|
||||
|
||||
|
||||
/*
 * State given to parse_tsquery() together with the pushval_morph callback.
 * pushval_morph() casts its opaque Datum back to a pointer to this struct.
 */
typedef struct MorphOpaque
|
||||
{
|
||||
Oid cfg_id; /* configuration id that pushval_morph() passes to parsetext() */
|
||||
int qoperator; /* operator pushed between word positions; the callers shown set OP_AND or OP_PHRASE */
|
||||
} MorphOpaque;
|
||||
|
||||
|
||||
Datum
|
||||
get_current_ts_config(PG_FUNCTION_ARGS)
|
||||
{
|
||||
@ -262,60 +269,81 @@ to_tsvector(PG_FUNCTION_ARGS)
|
||||
* to the stack.
|
||||
*
|
||||
* All words belonging to the same variant are pushed as an ANDed list,
|
||||
* and different variants are ORred together.
|
||||
* and different variants are ORed together.
|
||||
*/
|
||||
static void
|
||||
pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
|
||||
{
|
||||
int32 count = 0;
|
||||
ParsedText prs;
|
||||
uint32 variant,
|
||||
pos,
|
||||
cntvar = 0,
|
||||
cntpos = 0,
|
||||
cnt = 0;
|
||||
Oid cfg_id = DatumGetObjectId(opaque); /* the input is actually
|
||||
* an Oid, not a pointer */
|
||||
int32 count = 0;
|
||||
ParsedText prs;
|
||||
uint32 variant,
|
||||
pos = 0,
|
||||
cntvar = 0,
|
||||
cntpos = 0,
|
||||
cnt = 0;
|
||||
MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);
|
||||
|
||||
prs.lenwords = 4;
|
||||
prs.curwords = 0;
|
||||
prs.pos = 0;
|
||||
prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
|
||||
|
||||
parsetext(cfg_id, &prs, strval, lenval);
|
||||
parsetext(data->cfg_id, &prs, strval, lenval);
|
||||
|
||||
if (prs.curwords > 0)
|
||||
{
|
||||
|
||||
while (count < prs.curwords)
|
||||
{
|
||||
pos = prs.words[count].pos.pos;
|
||||
/*
|
||||
* Were any stop words removed? If so, fill empty positions
|
||||
* with placeholders linked by an appropriate operator.
|
||||
*/
|
||||
if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
|
||||
{
|
||||
while (pos + 1 < prs.words[count].pos.pos)
|
||||
{
|
||||
/* put placeholders for each missing stop word */
|
||||
pushStop(state);
|
||||
if (cntpos)
|
||||
pushOperator(state, data->qoperator, 1);
|
||||
cntpos++;
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
|
||||
pos = prs.words[count].pos.pos; /* save current word's position */
|
||||
|
||||
/* Go through all variants obtained from this token */
|
||||
cntvar = 0;
|
||||
while (count < prs.curwords && pos == prs.words[count].pos.pos)
|
||||
{
|
||||
variant = prs.words[count].nvariant;
|
||||
|
||||
/* Push all words belonging to the same variant */
|
||||
cnt = 0;
|
||||
while (count < prs.curwords && pos == prs.words[count].pos.pos && variant == prs.words[count].nvariant)
|
||||
while (count < prs.curwords &&
|
||||
pos == prs.words[count].pos.pos &&
|
||||
variant == prs.words[count].nvariant)
|
||||
{
|
||||
|
||||
pushValue(state, prs.words[count].word, prs.words[count].len, weight,
|
||||
((prs.words[count].flags & TSL_PREFIX) || prefix) ? true : false);
|
||||
pushValue(state,
|
||||
prs.words[count].word,
|
||||
prs.words[count].len,
|
||||
weight,
|
||||
((prs.words[count].flags & TSL_PREFIX) || prefix));
|
||||
pfree(prs.words[count].word);
|
||||
if (cnt)
|
||||
pushOperator(state, OP_AND);
|
||||
pushOperator(state, OP_AND, 0);
|
||||
cnt++;
|
||||
count++;
|
||||
}
|
||||
|
||||
if (cntvar)
|
||||
pushOperator(state, OP_OR);
|
||||
pushOperator(state, OP_OR, 0);
|
||||
cntvar++;
|
||||
}
|
||||
|
||||
if (cntpos)
|
||||
pushOperator(state, OP_AND);
|
||||
|
||||
pushOperator(state, data->qoperator, 1); /* distance may be useful */
|
||||
cntpos++;
|
||||
}
|
||||
|
||||
@ -329,44 +357,18 @@ pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval,
|
||||
Datum
|
||||
to_tsquery_byid(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Oid cfgid = PG_GETARG_OID(0);
|
||||
text *in = PG_GETARG_TEXT_P(1);
|
||||
TSQuery query;
|
||||
QueryItem *res;
|
||||
int32 len;
|
||||
text *in = PG_GETARG_TEXT_P(1);
|
||||
TSQuery query;
|
||||
MorphOpaque data;
|
||||
|
||||
query = parse_tsquery(text_to_cstring(in), pushval_morph, ObjectIdGetDatum(cfgid), false);
|
||||
data.cfg_id = PG_GETARG_OID(0);
|
||||
data.qoperator = OP_AND;
|
||||
|
||||
if (query->size == 0)
|
||||
PG_RETURN_TSQUERY(query);
|
||||
query = parse_tsquery(text_to_cstring(in),
|
||||
pushval_morph,
|
||||
PointerGetDatum(&data),
|
||||
false);
|
||||
|
||||
/* clean out any stopword placeholders from the tree */
|
||||
res = clean_fakeval(GETQUERY(query), &len);
|
||||
if (!res)
|
||||
{
|
||||
SET_VARSIZE(query, HDRSIZETQ);
|
||||
query->size = 0;
|
||||
PG_RETURN_POINTER(query);
|
||||
}
|
||||
memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(QueryItem));
|
||||
|
||||
/*
|
||||
* Removing the stopword placeholders might've resulted in fewer
|
||||
* QueryItems. If so, move the operands up accordingly.
|
||||
*/
|
||||
if (len != query->size)
|
||||
{
|
||||
char *oldoperand = GETOPERAND(query);
|
||||
int32 lenoperand = VARSIZE(query) - (oldoperand - (char *) query);
|
||||
|
||||
Assert(len < query->size);
|
||||
|
||||
query->size = len;
|
||||
memmove((void *) GETOPERAND(query), oldoperand, VARSIZE(query) - (oldoperand - (char *) query));
|
||||
SET_VARSIZE(query, COMPUTESIZE(len, lenoperand));
|
||||
}
|
||||
|
||||
pfree(res);
|
||||
PG_RETURN_TSQUERY(query);
|
||||
}
|
||||
|
||||
@ -385,44 +387,18 @@ to_tsquery(PG_FUNCTION_ARGS)
|
||||
Datum
|
||||
plainto_tsquery_byid(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Oid cfgid = PG_GETARG_OID(0);
|
||||
text *in = PG_GETARG_TEXT_P(1);
|
||||
TSQuery query;
|
||||
QueryItem *res;
|
||||
int32 len;
|
||||
text *in = PG_GETARG_TEXT_P(1);
|
||||
TSQuery query;
|
||||
MorphOpaque data;
|
||||
|
||||
query = parse_tsquery(text_to_cstring(in), pushval_morph, ObjectIdGetDatum(cfgid), true);
|
||||
data.cfg_id = PG_GETARG_OID(0);
|
||||
data.qoperator = OP_AND;
|
||||
|
||||
if (query->size == 0)
|
||||
PG_RETURN_TSQUERY(query);
|
||||
query = parse_tsquery(text_to_cstring(in),
|
||||
pushval_morph,
|
||||
PointerGetDatum(&data),
|
||||
true);
|
||||
|
||||
/* clean out any stopword placeholders from the tree */
|
||||
res = clean_fakeval(GETQUERY(query), &len);
|
||||
if (!res)
|
||||
{
|
||||
SET_VARSIZE(query, HDRSIZETQ);
|
||||
query->size = 0;
|
||||
PG_RETURN_POINTER(query);
|
||||
}
|
||||
memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(QueryItem));
|
||||
|
||||
/*
|
||||
* Removing the stopword placeholders might've resulted in fewer
|
||||
* QueryItems. If so, move the operands up accordingly.
|
||||
*/
|
||||
if (len != query->size)
|
||||
{
|
||||
char *oldoperand = GETOPERAND(query);
|
||||
int32 lenoperand = VARSIZE(query) - (oldoperand - (char *) query);
|
||||
|
||||
Assert(len < query->size);
|
||||
|
||||
query->size = len;
|
||||
memmove((void *) GETOPERAND(query), oldoperand, lenoperand);
|
||||
SET_VARSIZE(query, COMPUTESIZE(len, lenoperand));
|
||||
}
|
||||
|
||||
pfree(res);
|
||||
PG_RETURN_POINTER(query);
|
||||
}
|
||||
|
||||
@ -437,3 +413,34 @@ plainto_tsquery(PG_FUNCTION_ARGS)
|
||||
ObjectIdGetDatum(cfgId),
|
||||
PointerGetDatum(in)));
|
||||
}
|
||||
|
||||
|
||||
/*
 * phraseto_tsquery_byid
 *
 * Build a tsquery from the text in argument 1 using the text search
 * configuration whose OID is argument 0.  Set up the same way as
 * plainto_tsquery_byid, except that qoperator is OP_PHRASE rather than
 * OP_AND, so pushval_morph() links successive word positions with the
 * phrase operator.
 */
Datum
|
||||
phraseto_tsquery_byid(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *in = PG_GETARG_TEXT_P(1);
|
||||
TSQuery query;
|
||||
MorphOpaque data;
|
||||
|
||||
/* fill in the state that pushval_morph() reads through its opaque argument */
data.cfg_id = PG_GETARG_OID(0);
|
||||
data.qoperator = OP_PHRASE;
|
||||
|
||||
/*
 * data lives on this function's stack; only its address is handed over.
 * NOTE(review): the final "true" matches plainto_tsquery_byid and differs
 * from to_tsquery_byid -- presumably it selects plain-text input handling;
 * confirm against parse_tsquery().
 */
query = parse_tsquery(text_to_cstring(in),
|
||||
pushval_morph,
|
||||
PointerGetDatum(&data),
|
||||
true);
|
||||
|
||||
PG_RETURN_TSQUERY(query);
|
||||
}
|
||||
|
||||
/*
 * phraseto_tsquery
 *
 * One-argument variant of phraseto_tsquery_byid: takes only the query text
 * (argument 0), obtains a configuration OID from getTSCurrentConfig(), and
 * delegates the actual work to phraseto_tsquery_byid.
 */
Datum
|
||||
phraseto_tsquery(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *in = PG_GETARG_TEXT_P(0);
|
||||
Oid cfgId;
|
||||
|
||||
/* NOTE(review): the meaning of the "true" flag is not visible here -- confirm against getTSCurrentConfig() */
cfgId = getTSCurrentConfig(true);
|
||||
/* forward (config OID, text) in the argument order phraseto_tsquery_byid expects */
PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid,
|
||||
ObjectIdGetDatum(cfgId),
|
||||
PointerGetDatum(in)));
|
||||
}
|
||||
|
@ -454,7 +454,7 @@ hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
|
||||
}
|
||||
|
||||
static void
|
||||
hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
|
||||
hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
|
||||
{
|
||||
int i;
|
||||
QueryItem *item = GETQUERY(query);
|
||||
@ -467,6 +467,7 @@ hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
|
||||
}
|
||||
|
||||
word = &(prs->words[prs->curwords - 1]);
|
||||
word->pos = LIMITPOS(pos);
|
||||
for (i = 0; i < query->size; i++)
|
||||
{
|
||||
if (item->type == QI_VAL &&
|
||||
@ -492,17 +493,20 @@ addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme
|
||||
{
|
||||
ParsedLex *tmplexs;
|
||||
TSLexeme *ptr;
|
||||
int32 savedpos;
|
||||
|
||||
while (lexs)
|
||||
{
|
||||
|
||||
if (lexs->type > 0)
|
||||
hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
|
||||
|
||||
ptr = norms;
|
||||
savedpos = prs->vectorpos;
|
||||
while (ptr && ptr->lexeme)
|
||||
{
|
||||
hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
|
||||
if (ptr->flags & TSL_ADDPOS)
|
||||
savedpos++;
|
||||
hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
|
||||
ptr++;
|
||||
}
|
||||
|
||||
@ -516,6 +520,8 @@ addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme
|
||||
ptr = norms;
|
||||
while (ptr->lexeme)
|
||||
{
|
||||
if (ptr->flags & TSL_ADDPOS)
|
||||
prs->vectorpos++;
|
||||
pfree(ptr->lexeme);
|
||||
ptr++;
|
||||
}
|
||||
@ -575,7 +581,10 @@ hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int bu
|
||||
do
|
||||
{
|
||||
if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
|
||||
{
|
||||
prs->vectorpos++;
|
||||
addHLParsedLex(prs, query, lexs, norms);
|
||||
}
|
||||
else
|
||||
addHLParsedLex(prs, query, lexs, NULL);
|
||||
} while (norms);
|
||||
|
@ -261,7 +261,7 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
|
||||
/*
|
||||
* Traverse the tsquery in preorder, calculating selectivity as:
|
||||
*
|
||||
* selec(left_oper) * selec(right_oper) in AND nodes,
|
||||
* selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
|
||||
*
|
||||
* selec(left_oper) + selec(right_oper) -
|
||||
* selec(left_oper) * selec(right_oper) in OR nodes,
|
||||
@ -400,6 +400,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
|
||||
lookup, length, minfreq);
|
||||
break;
|
||||
|
||||
case OP_PHRASE:
|
||||
case OP_AND:
|
||||
s1 = tsquery_opr_selec(item + 1, operand,
|
||||
lookup, length, minfreq);
|
||||
|
@ -2030,15 +2030,36 @@ typedef struct
|
||||
} hlCheck;
|
||||
|
||||
static bool
|
||||
checkcondition_HL(void *checkval, QueryOperand *val)
|
||||
checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
|
||||
{
|
||||
int i;
|
||||
hlCheck *checkval = (hlCheck *) opaque;
|
||||
|
||||
for (i = 0; i < ((hlCheck *) checkval)->len; i++)
|
||||
for (i = 0; i < checkval->len; i++)
|
||||
{
|
||||
if (((hlCheck *) checkval)->words[i].item == val)
|
||||
return true;
|
||||
if (checkval->words[i].item == val)
|
||||
{
|
||||
/* don't need to find all positions */
|
||||
if (!data)
|
||||
return true;
|
||||
|
||||
if (!data->pos)
|
||||
{
|
||||
data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
|
||||
data->allocated = true;
|
||||
data->npos = 1;
|
||||
data->pos[0] = checkval->words[i].pos;
|
||||
}
|
||||
else if (data->pos[data->npos - 1] < checkval->words[i].pos)
|
||||
{
|
||||
data->pos[data->npos++] = checkval->words[i].pos;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (data && data->npos > 0)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -2400,7 +2421,7 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
|
||||
|
||||
if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
|
||||
{
|
||||
/* best already finded, so try one more cover */
|
||||
/* best already found, so try one more cover */
|
||||
p++;
|
||||
continue;
|
||||
}
|
||||
|
Reference in New Issue
Block a user