1
0
mirror of https://github.com/postgres/postgres.git synced 2026-01-05 23:38:41 +03:00

Phrase full text search.

Patch introduces new text search operator (<-> or <DISTANCE>) into tsquery.
On-disk and binary in/out format of tsquery are backward compatible.
It has two side effect:
- change order for tsquery, so, users, who has a btree index over tsquery,
  should reindex it
- less number of parenthesis in tsquery output, and tsquery becomes more
  readable

Authors: Teodor Sigaev, Oleg Bartunov, Dmitry Ivanov
Reviewers: Alexander Korotkov, Artur Zakirov
This commit is contained in:
Teodor Sigaev
2016-04-07 18:44:18 +03:00
parent 015e88942a
commit bb140506df
30 changed files with 2542 additions and 450 deletions

View File

@@ -364,8 +364,10 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
return 0.0;
/* XXX: What about NOT? */
res = (item->type == QI_OPR && item->qoperator.oper == OP_AND) ?
calc_rank_and(w, t, q) : calc_rank_or(w, t, q);
res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND ||
item->qoperator.oper == OP_PHRASE)) ?
calc_rank_and(w, t, q) :
calc_rank_or(w, t, q);
if (res < 0)
res = 1e-20f;
@@ -496,10 +498,17 @@ ts_rank_tt(PG_FUNCTION_ARGS)
typedef struct
{
QueryItem **item;
int16 nitem;
uint8 wclass;
int32 pos;
union {
struct { /* compiled doc representation */
QueryItem **items;
int16 nitem;
} query;
struct { /* struct is used for preparing doc representation */
QueryItem *item;
WordEntry *entry;
} map;
} data;
WordEntryPos pos;
} DocRepresentation;
static int
@@ -508,26 +517,59 @@ compareDocR(const void *va, const void *vb)
const DocRepresentation *a = (const DocRepresentation *) va;
const DocRepresentation *b = (const DocRepresentation *) vb;
if (a->pos == b->pos)
return 0;
return (a->pos > b->pos) ? 1 : -1;
if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos))
{
if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos))
{
if (a->data.map.entry == b->data.map.entry)
return 0;
return (a->data.map.entry > b->data.map.entry) ? 1 : -1;
}
return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1;
}
return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 1 : -1;
}
#define MAXQROPOS MAXENTRYPOS
typedef struct
{
bool operandexists;
bool reverseinsert; /* indicates insert order,
true means descending order */
uint32 npos;
WordEntryPos pos[MAXQROPOS];
} QueryRepresentationOperand;
typedef struct
{
TSQuery query;
bool *operandexist;
TSQuery query;
QueryRepresentationOperand *operandData;
} QueryRepresentation;
#define QR_GET_OPERAND_EXISTS(q, v) ( (q)->operandexist[ ((QueryItem*)(v)) - GETQUERY((q)->query) ] )
#define QR_SET_OPERAND_EXISTS(q, v) QR_GET_OPERAND_EXISTS(q,v) = true
#define QR_GET_OPERAND_DATA(q, v) \
( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) )
static bool
checkcondition_QueryOperand(void *checkval, QueryOperand *val)
checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data)
{
QueryRepresentation *qr = (QueryRepresentation *) checkval;
QueryRepresentation *qr = (QueryRepresentation *) checkval;
QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val);
return QR_GET_OPERAND_EXISTS(qr, val);
if (!opData->operandexists)
return false;
if (data)
{
data->npos = opData->npos;
data->pos = opData->pos;
if (opData->reverseinsert)
data->pos += MAXQROPOS - opData->npos;
}
return true;
}
typedef struct
@@ -539,14 +581,65 @@ typedef struct
DocRepresentation *end;
} CoverExt;
static void
resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert)
{
int i;
for(i = 0; i < qr->query->size; i++)
{
qr->operandData[i].operandexists = false;
qr->operandData[i].reverseinsert = reverseinsert;
qr->operandData[i].npos = 0;
}
}
static void
fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry)
{
int i;
int lastPos;
QueryRepresentationOperand *opData;
for (i = 0; i < entry->data.query.nitem; i++)
{
if (entry->data.query.items[i]->type != QI_VAL)
continue;
opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]);
opData->operandexists = true;
if (opData->npos == 0)
{
lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0;
opData->pos[lastPos] = entry->pos;
opData->npos++;
continue;
}
lastPos = opData->reverseinsert ?
(MAXQROPOS - opData->npos) :
(opData->npos - 1);
if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos))
{
lastPos = opData->reverseinsert ?
(MAXQROPOS - 1 - opData->npos) :
(opData->npos);
opData->pos[lastPos] = entry->pos;
opData->npos++;
}
}
}
static bool
Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
{
DocRepresentation *ptr;
int lastpos = ext->pos;
int i;
bool found = false;
DocRepresentation *ptr;
int lastpos = ext->pos;
bool found = false;
/*
* since this function recurses, it could be driven to stack overflow.
@@ -554,7 +647,7 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
*/
check_stack_depth();
memset(qr->operandexist, 0, sizeof(bool) * qr->query->size);
resetQueryRepresentation(qr, false);
ext->p = INT_MAX;
ext->q = 0;
@@ -563,16 +656,13 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
/* find upper bound of cover from current position, move up */
while (ptr - doc < len)
{
for (i = 0; i < ptr->nitem; i++)
{
if (ptr->item[i]->type == QI_VAL)
QR_SET_OPERAND_EXISTS(qr, ptr->item[i]);
}
fillQueryRepresentationData(qr, ptr);
if (TS_execute(GETQUERY(qr->query), (void *) qr, false, checkcondition_QueryOperand))
{
if (ptr->pos > ext->q)
if (WEP_GETPOS(ptr->pos) > ext->q)
{
ext->q = ptr->pos;
ext->q = WEP_GETPOS(ptr->pos);
ext->end = ptr;
lastpos = ptr - doc;
found = true;
@@ -585,22 +675,24 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
if (!found)
return false;
memset(qr->operandexist, 0, sizeof(bool) * qr->query->size);
resetQueryRepresentation(qr, true);
ptr = doc + lastpos;
/* find lower bound of cover from found upper bound, move down */
while (ptr >= doc + ext->pos)
{
for (i = 0; i < ptr->nitem; i++)
if (ptr->item[i]->type == QI_VAL)
QR_SET_OPERAND_EXISTS(qr, ptr->item[i]);
/*
* we scan doc from right to left, so pos info in reverse order!
*/
fillQueryRepresentationData(qr, ptr);
if (TS_execute(GETQUERY(qr->query), (void *) qr, true, checkcondition_QueryOperand))
{
if (ptr->pos < ext->p)
if (WEP_GETPOS(ptr->pos) < ext->p)
{
ext->begin = ptr;
ext->p = ptr->pos;
ext->p = WEP_GETPOS(ptr->pos);
}
break;
}
@@ -628,18 +720,20 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
WordEntry *entry,
*firstentry;
WordEntryPos *post;
int32 dimt,
int32 dimt, /* number of 'post' items */
j,
i,
nitem;
int len = qr->query->size * 4,
cur = 0;
DocRepresentation *doc;
char *operand;
doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
operand = GETOPERAND(qr->query);
/*
* Iterate through query to make DocRepresentaion for words and it's entries
* satisfied by query
*/
for (i = 0; i < qr->query->size; i++)
{
QueryOperand *curoperand;
@@ -649,13 +743,11 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
curoperand = &item[i].qoperand;
if (QR_GET_OPERAND_EXISTS(qr, &item[i]))
continue;
firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
if (!entry)
continue;
/* iterations over entries in tsvector */
while (entry - firstentry < nitem)
{
if (entry->haspos)
@@ -676,53 +768,67 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
}
/* iterations over entry's positions */
for (j = 0; j < dimt; j++)
{
if (j == 0)
if (curoperand->weight == 0 ||
curoperand->weight & (1 << WEP_GETWEIGHT(post[j])))
{
int k;
doc[cur].nitem = 0;
doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * qr->query->size);
for (k = 0; k < qr->query->size; k++)
{
QueryOperand *kptr = &item[k].qoperand;
QueryOperand *iptr = &item[i].qoperand;
if (k == i ||
(item[k].type == QI_VAL &&
compareQueryOperand(&kptr, &iptr, operand) == 0))
{
/*
* if k == i, we've already checked above that
* it's type == Q_VAL
*/
doc[cur].item[doc[cur].nitem] = item + k;
doc[cur].nitem++;
QR_SET_OPERAND_EXISTS(qr, item + k);
}
}
doc[cur].pos = post[j];
doc[cur].data.map.entry = entry;
doc[cur].data.map.item = (QueryItem *) curoperand;
cur++;
}
else
{
doc[cur].nitem = doc[cur - 1].nitem;
doc[cur].item = doc[cur - 1].item;
}
doc[cur].pos = WEP_GETPOS(post[j]);
doc[cur].wclass = WEP_GETWEIGHT(post[j]);
cur++;
}
entry++;
}
}
*doclen = cur;
if (cur > 0)
{
DocRepresentation *rptr = doc + 1,
*wptr = doc,
storage;
/*
* Sort representation in ascending order by pos and entry
*/
qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
/*
* Join QueryItem per WordEntry and it's position
*/
storage.pos = doc->pos;
storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
storage.data.query.items[0] = doc->data.map.item;
storage.data.query.nitem = 1;
while (rptr - doc < cur)
{
if (rptr->pos == (rptr-1)->pos &&
rptr->data.map.entry == (rptr-1)->data.map.entry)
{
storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item;
storage.data.query.nitem++;
}
else
{
*wptr = storage;
wptr++;
storage.pos = rptr->pos;
storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
storage.data.query.items[0] = rptr->data.map.item;
storage.data.query.nitem = 1;
}
rptr++;
}
*wptr = storage;
wptr++;
*doclen = wptr - doc;
return doc;
}
@@ -758,12 +864,13 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
}
qr.query = query;
qr.operandexist = (bool *) palloc0(sizeof(bool) * query->size);
qr.operandData = (QueryRepresentationOperand *)
palloc0(sizeof(QueryRepresentationOperand) * query->size);
doc = get_docrep(txt, &qr, &doclen);
if (!doc)
{
pfree(qr.operandexist);
pfree(qr.operandData);
return 0.0;
}
@@ -777,7 +884,7 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
while (ptr <= ext.end)
{
InvSum += invws[ptr->wclass];
InvSum += invws[WEP_GETWEIGHT(ptr->pos)];
ptr++;
}
@@ -827,7 +934,7 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
pfree(doc);
pfree(qr.operandexist);
pfree(qr.operandData);
return (float4) Wdoc;
}