1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-23 14:01:44 +03:00

Phrase full text search.

Patch introduces new text search operator (<-> or <DISTANCE>) into tsquery.
On-disk and binary in/out format of tsquery are backward compatible.
It has two side effect:
- change order for tsquery, so, users, who has a btree index over tsquery,
  should reindex it
- less number of parenthesis in tsquery output, and tsquery becomes more
  readable

Authors: Teodor Sigaev, Oleg Bartunov, Dmitry Ivanov
Reviewers: Alexander Korotkov, Artur Zakirov
This commit is contained in:
Teodor Sigaev
2016-04-07 18:44:18 +03:00
parent 015e88942a
commit bb140506df
30 changed files with 2542 additions and 450 deletions

View File

@ -56,7 +56,7 @@ struct TSQueryParserStateData
/*
* subroutine to parse the modifiers (weight and prefix flag currently)
* part, like ':1AB' of a query.
* part, like ':AB*' of a query.
*/
static char *
get_modifiers(char *buf, int16 *weight, bool *prefix)
@ -100,6 +100,94 @@ get_modifiers(char *buf, int16 *weight, bool *prefix)
return buf;
}
/*
* Parse phrase operator. The operator
* may take the following forms:
*
* a <X> b (distance is no greater than X)
* a <-> b (default distance = 1)
*
* The buffer should begin with '<' char
*/
static char *
parse_phrase_operator(char *buf, int16 *distance)
{
enum
{
PHRASE_OPEN = 0,
PHRASE_DIST,
PHRASE_CLOSE,
PHRASE_ERR,
PHRASE_FINISH
} state = PHRASE_OPEN;
char *ptr = buf;
char *endptr;
long l = 1;
while (*ptr)
{
switch(state)
{
case PHRASE_OPEN:
Assert(t_iseq(ptr, '<'));
state = PHRASE_DIST;
ptr++;
break;
case PHRASE_DIST:
if (t_iseq(ptr, '-'))
{
state = PHRASE_CLOSE;
ptr++;
break;
}
else if (!t_isdigit(ptr))
{
state = PHRASE_ERR;
break;
}
l = strtol(ptr, &endptr, 10);
if (ptr == endptr)
state = PHRASE_ERR;
else if (errno == ERANGE || l > MAXENTRYPOS)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("distance in phrase operator should not be greater than %d",
MAXENTRYPOS)));
else
{
state = PHRASE_CLOSE;
ptr = endptr;
}
break;
case PHRASE_CLOSE:
if (t_iseq(ptr, '>'))
{
state = PHRASE_FINISH;
ptr++;
}
else
state = PHRASE_ERR;
break;
case PHRASE_FINISH:
*distance = (int16) l;
return ptr;
case PHRASE_ERR:
default:
goto err;
}
}
err:
*distance = -1;
return buf;
}
/*
* token types for parsing
*/
@ -116,8 +204,10 @@ typedef enum
/*
* get token from query string
*
* *operator is filled in with OP_* when return values is PT_OPR
* *operator is filled in with OP_* when return values is PT_OPR,
* but *weight could contain a distance value in case of phrase operator.
* *strval, *lenval and *weight are filled in when return value is PT_VAL
*
*/
static ts_tokentype
gettoken_query(TSQueryParserState state,
@ -185,13 +275,23 @@ gettoken_query(TSQueryParserState state,
(state->buf)++;
return PT_OPR;
}
if (t_iseq(state->buf, '|'))
else if (t_iseq(state->buf, '|'))
{
state->state = WAITOPERAND;
*operator = OP_OR;
(state->buf)++;
return PT_OPR;
}
else if (t_iseq(state->buf, '<'))
{
state->state = WAITOPERAND;
*operator = OP_PHRASE;
/* weight var is used as storage for distance */
state->buf = parse_phrase_operator(state->buf, weight);
if (*weight < 0)
return PT_ERR;
return PT_OPR;
}
else if (t_iseq(state->buf, ')'))
{
(state->buf)++;
@ -223,15 +323,16 @@ gettoken_query(TSQueryParserState state,
* Push an operator to state->polstr
*/
void
pushOperator(TSQueryParserState state, int8 oper)
pushOperator(TSQueryParserState state, int8 oper, int16 distance)
{
QueryOperator *tmp;
Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR);
Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR || oper == OP_PHRASE);
tmp = (QueryOperator *) palloc0(sizeof(QueryOperator));
tmp->type = QI_OPR;
tmp->oper = oper;
tmp->distance = (oper == OP_PHRASE) ? distance : 0;
/* left is filled in later with findoprnd */
state->polstr = lcons(tmp, state->polstr);
@ -330,14 +431,18 @@ makepol(TSQueryParserState state,
PushFunction pushval,
Datum opaque)
{
int8 operator = 0;
ts_tokentype type;
int lenval = 0;
char *strval = NULL;
int8 opstack[STACKDEPTH];
int lenstack = 0;
int16 weight = 0;
bool prefix;
int8 operator = 0;
ts_tokentype type;
int lenval = 0;
char *strval = NULL;
struct
{
int8 op;
int16 distance;
} opstack[STACKDEPTH];
int lenstack = 0;
int16 weight = 0;
bool prefix;
/* since this function recurses, it could be driven to stack overflow */
check_stack_depth();
@ -348,39 +453,48 @@ makepol(TSQueryParserState state,
{
case PT_VAL:
pushval(opaque, state, strval, lenval, weight, prefix);
while (lenstack && (opstack[lenstack - 1] == OP_AND ||
opstack[lenstack - 1] == OP_NOT))
while (lenstack && (opstack[lenstack - 1].op == OP_AND ||
opstack[lenstack - 1].op == OP_PHRASE ||
opstack[lenstack - 1].op == OP_NOT))
{
lenstack--;
pushOperator(state, opstack[lenstack]);
pushOperator(state,
opstack[lenstack].op,
opstack[lenstack].distance);
}
break;
case PT_OPR:
if (lenstack && operator == OP_OR)
pushOperator(state, OP_OR);
pushOperator(state, OP_OR, 0);
else
{
if (lenstack == STACKDEPTH) /* internal error */
elog(ERROR, "tsquery stack too small");
opstack[lenstack] = operator;
opstack[lenstack].op = operator;
opstack[lenstack].distance = weight;
lenstack++;
}
break;
case PT_OPEN:
makepol(state, pushval, opaque);
while (lenstack && (opstack[lenstack - 1] == OP_AND ||
opstack[lenstack - 1] == OP_NOT))
while (lenstack && (opstack[lenstack - 1].op == OP_AND ||
opstack[lenstack - 1].op == OP_PHRASE ||
opstack[lenstack - 1].op == OP_NOT))
{
lenstack--;
pushOperator(state, opstack[lenstack]);
pushOperator(state,
opstack[lenstack].op,
opstack[lenstack].distance);
}
break;
case PT_CLOSE:
while (lenstack)
{
lenstack--;
pushOperator(state, opstack[lenstack]);
pushOperator(state,
opstack[lenstack].op,
opstack[lenstack].distance);
};
return;
case PT_ERR:
@ -394,12 +508,14 @@ makepol(TSQueryParserState state,
while (lenstack)
{
lenstack--;
pushOperator(state, opstack[lenstack]);
pushOperator(state,
opstack[lenstack].op,
opstack[lenstack].distance);
}
}
static void
findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes)
findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes, bool *needcleanup)
{
/* since this function recurses, it could be driven to stack overflow. */
check_stack_depth();
@ -407,33 +523,47 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes)
if (*pos >= nnodes)
elog(ERROR, "malformed tsquery: operand not found");
if (ptr[*pos].type == QI_VAL ||
ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here, they
* haven't been cleaned away yet. */
if (ptr[*pos].type == QI_VAL)
{
(*pos)++;
}
else if (ptr[*pos].type == QI_VALSTOP)
{
*needcleanup = true; /* we'll have to remove stop words */
(*pos)++;
}
else
{
Assert(ptr[*pos].type == QI_OPR);
if (ptr[*pos].qoperator.oper == OP_NOT)
{
ptr[*pos].qoperator.left = 1;
ptr[*pos].qoperator.left = 1; /* fixed offset */
(*pos)++;
findoprnd_recurse(ptr, pos, nnodes);
/* process the only argument */
findoprnd_recurse(ptr, pos, nnodes, needcleanup);
}
else
{
QueryOperator *curitem = &ptr[*pos].qoperator;
int tmp = *pos;
QueryOperator *curitem = &ptr[*pos].qoperator;
int tmp = *pos; /* save current position */
Assert(curitem->oper == OP_AND || curitem->oper == OP_OR);
Assert(curitem->oper == OP_AND ||
curitem->oper == OP_OR ||
curitem->oper == OP_PHRASE);
if (curitem->oper == OP_PHRASE)
*needcleanup = true; /* push OP_PHRASE down later */
(*pos)++;
findoprnd_recurse(ptr, pos, nnodes);
curitem->left = *pos - tmp;
findoprnd_recurse(ptr, pos, nnodes);
/* process RIGHT argument */
findoprnd_recurse(ptr, pos, nnodes, needcleanup);
curitem->left = *pos - tmp; /* set LEFT arg's offset */
/* process LEFT argument */
findoprnd_recurse(ptr, pos, nnodes, needcleanup);
}
}
}
@ -444,12 +574,13 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes)
* QueryItems must be in polish (prefix) notation.
*/
static void
findoprnd(QueryItem *ptr, int size)
findoprnd(QueryItem *ptr, int size, bool *needcleanup)
{
uint32 pos;
*needcleanup = false;
pos = 0;
findoprnd_recurse(ptr, &pos, size);
findoprnd_recurse(ptr, &pos, size, needcleanup);
if (pos != size)
elog(ERROR, "malformed tsquery: extra nodes");
@ -466,9 +597,6 @@ findoprnd(QueryItem *ptr, int size)
*
* opaque is passed on to pushval as is, pushval can use it to store its
* private state.
*
* The returned query might contain QI_STOPVAL nodes. The caller is responsible
* for cleaning them up (with clean_fakeval)
*/
TSQuery
parse_tsquery(char *buf,
@ -482,6 +610,7 @@ parse_tsquery(char *buf,
int commonlen;
QueryItem *ptr;
ListCell *cell;
bool needcleanup;
/* init state */
state.buffer = buf;
@ -531,7 +660,7 @@ parse_tsquery(char *buf,
i = 0;
foreach(cell, state.polstr)
{
QueryItem *item = (QueryItem *) lfirst(cell);
QueryItem *item = (QueryItem *) lfirst(cell);
switch (item->type)
{
@ -555,7 +684,14 @@ parse_tsquery(char *buf,
pfree(state.op);
/* Set left operand pointers for every operator. */
findoprnd(ptr, query->size);
findoprnd(ptr, query->size, &needcleanup);
/*
* QI_VALSTOP nodes should be cleaned and
* and OP_PHRASE should be pushed down
*/
if (needcleanup)
return cleanup_fakeval_and_phrase(query);
return query;
}
@ -600,12 +736,15 @@ while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \
(inf)->cur = (inf)->buf + len; \
}
#define PRINT_PRIORITY(x) \
( (QO_PRIORITY(x) == OP_NOT) ? OP_NOT_PHRASE : QO_PRIORITY(x) )
/*
* recursive walk on tree and print it in
* infix (human-readable) view
* recursively traverse the tree and
* print it in infix (human-readable) form
*/
static void
infix(INFIX *in, bool first)
infix(INFIX *in, int parentPriority)
{
/* since this function recurses, it could be driven to stack overflow. */
check_stack_depth();
@ -674,24 +813,22 @@ infix(INFIX *in, bool first)
}
else if (in->curpol->qoperator.oper == OP_NOT)
{
bool isopr = false;
int priority = PRINT_PRIORITY(in->curpol);
if (priority < parentPriority)
{
RESIZEBUF(in, 2);
sprintf(in->cur, "( ");
in->cur = strchr(in->cur, '\0');
}
RESIZEBUF(in, 1);
*(in->cur) = '!';
in->cur++;
*(in->cur) = '\0';
in->curpol++;
if (in->curpol->type == QI_OPR)
{
isopr = true;
RESIZEBUF(in, 2);
sprintf(in->cur, "( ");
in->cur = strchr(in->cur, '\0');
}
infix(in, isopr);
if (isopr)
infix(in, priority);
if (priority < parentPriority)
{
RESIZEBUF(in, 2);
sprintf(in->cur, " )");
@ -701,11 +838,18 @@ infix(INFIX *in, bool first)
else
{
int8 op = in->curpol->qoperator.oper;
int priority = PRINT_PRIORITY(in->curpol);
int16 distance = in->curpol->qoperator.distance;
INFIX nrm;
bool needParenthesis = false;
in->curpol++;
if (op == OP_OR && !first)
if (priority < parentPriority ||
(op == OP_PHRASE &&
(priority == parentPriority || /* phrases are not commutative! */
parentPriority == OP_PRIORITY(OP_AND))))
{
needParenthesis = true;
RESIZEBUF(in, 2);
sprintf(in->cur, "( ");
in->cur = strchr(in->cur, '\0');
@ -717,14 +861,14 @@ infix(INFIX *in, bool first)
nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
/* get right operand */
infix(&nrm, false);
infix(&nrm, priority);
/* get & print left operand */
in->curpol = nrm.curpol;
infix(in, false);
infix(in, priority);
/* print operator & right operand */
RESIZEBUF(in, 3 + (nrm.cur - nrm.buf));
RESIZEBUF(in, 3 + (2 + 10 /* distance */) + (nrm.cur - nrm.buf));
switch (op)
{
case OP_OR:
@ -733,6 +877,12 @@ infix(INFIX *in, bool first)
case OP_AND:
sprintf(in->cur, " & %s", nrm.buf);
break;
case OP_PHRASE:
if (distance != 1)
sprintf(in->cur, " <%d> %s", distance, nrm.buf);
else
sprintf(in->cur, " <-> %s", nrm.buf);
break;
default:
/* OP_NOT is handled in above if-branch */
elog(ERROR, "unrecognized operator type: %d", op);
@ -740,7 +890,7 @@ infix(INFIX *in, bool first)
in->cur = strchr(in->cur, '\0');
pfree(nrm.buf);
if (op == OP_OR && !first)
if (needParenthesis)
{
RESIZEBUF(in, 2);
sprintf(in->cur, " )");
@ -749,7 +899,6 @@ infix(INFIX *in, bool first)
}
}
Datum
tsqueryout(PG_FUNCTION_ARGS)
{
@ -768,7 +917,7 @@ tsqueryout(PG_FUNCTION_ARGS)
nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
*(nrm.cur) = '\0';
nrm.op = GETOPERAND(query);
infix(&nrm, true);
infix(&nrm, -1 /* lowest priority */);
PG_FREE_IF_COPY(query, 0);
PG_RETURN_CSTRING(nrm.buf);
@ -789,7 +938,8 @@ tsqueryout(PG_FUNCTION_ARGS)
*
* For each operator:
* uint8 type, QI_OPR
* uint8 operator, one of OP_AND, OP_OR, OP_NOT.
* uint8 operator, one of OP_AND, OP_PHRASE OP_OR, OP_NOT.
* uint16 distance (only for OP_PHRASE)
*/
Datum
tsquerysend(PG_FUNCTION_ARGS)
@ -815,6 +965,9 @@ tsquerysend(PG_FUNCTION_ARGS)
break;
case QI_OPR:
pq_sendint(&buf, item->qoperator.oper, sizeof(item->qoperator.oper));
if (item->qoperator.oper == OP_PHRASE)
pq_sendint(&buf, item->qoperator.distance,
sizeof(item->qoperator.distance));
break;
default:
elog(ERROR, "unrecognized tsquery node type: %d", item->type);
@ -830,15 +983,16 @@ tsquerysend(PG_FUNCTION_ARGS)
Datum
tsqueryrecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
TSQuery query;
int i,
len;
QueryItem *item;
int datalen;
char *ptr;
uint32 size;
const char **operands;
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
TSQuery query;
int i,
len;
QueryItem *item;
int datalen;
char *ptr;
uint32 size;
const char **operands;
bool needcleanup;
size = pq_getmsgint(buf, sizeof(uint32));
if (size > (MaxAllocSize / sizeof(QueryItem)))
@ -907,13 +1061,15 @@ tsqueryrecv(PG_FUNCTION_ARGS)
int8 oper;
oper = (int8) pq_getmsgint(buf, sizeof(int8));
if (oper != OP_NOT && oper != OP_OR && oper != OP_AND)
if (oper != OP_NOT && oper != OP_OR && oper != OP_AND && oper != OP_PHRASE)
elog(ERROR, "invalid tsquery: unrecognized operator type %d",
(int) oper);
if (i == size - 1)
elog(ERROR, "invalid pointer to right operand");
item->qoperator.oper = oper;
if (oper == OP_PHRASE)
item->qoperator.distance = (int16) pq_getmsgint(buf, sizeof(int16));
}
else
elog(ERROR, "unrecognized tsquery node type: %d", item->type);
@ -930,7 +1086,7 @@ tsqueryrecv(PG_FUNCTION_ARGS)
* Fill in the left-pointers. Checks that the tree is well-formed as a
* side-effect.
*/
findoprnd(item, size);
findoprnd(item, size, &needcleanup);
/* Copy operands to output struct */
for (i = 0; i < size; i++)
@ -949,7 +1105,10 @@ tsqueryrecv(PG_FUNCTION_ARGS)
SET_VARSIZE(query, len + datalen);
PG_RETURN_TSVECTOR(query);
if (needcleanup)
PG_RETURN_TSQUERY(cleanup_fakeval_and_phrase(query));
PG_RETURN_TSQUERY(query);
}
/*