1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-09 13:09:39 +03:00

Extend GIN to support partial-match searches, and extend tsquery to support
prefix matching using this facility.

Teodor Sigaev and Oleg Bartunov
This commit is contained in:
Tom Lane
2008-05-16 16:31:02 +00:00
parent e1bdd07c3c
commit e6dbcb72fa
32 changed files with 1284 additions and 508 deletions

View File

@@ -7,7 +7,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.12 2008/01/01 19:45:53 momjian Exp $
* $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.13 2008/05/16 16:31:01 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -71,45 +71,60 @@ cnt_length(TSVector t)
return len;
}
/*
 * Function form of WordECompareQueryItem: compares the text of tsvector
 * entry 'ptr' (found at eval + ptr->pos) with the text of query operand
 * 'item' (found at qval + item->distance).
 *
 * Strings of different length are ordered by length alone: the result is 1
 * when the entry is longer than the operand and -1 when it is shorter.
 * Only equal-length strings are compared bytewise, and in that case the
 * strncmp() result is returned unchanged.
 */
static int
WordECompareQueryItem(char *eval, char *qval, WordEntry *ptr, QueryOperand *item)
{
/* same length: the byte comparison decides (entry first, operand second) */
if (ptr->len == item->length)
return strncmp(
eval + ptr->pos,
qval + item->distance,
item->length);
/* different length: order by length only, the bytes are not inspected */
return (ptr->len > item->length) ? 1 : -1;
}
/*
 * Macro form of WordECompareQueryItem.
 *
 *	e, q - base addresses to which (p)->pos and (i)->distance are added
 *	p    - the WordEntry being examined
 *	i    - the QueryOperand being looked up
 *	m    - flag handed through as the last argument of tsCompareString();
 *	       the calls in find_wordentry pass false for the exact lookup and
 *	       true while scanning for prefix matches
 *
 * NOTE(review): the query operand is passed first and the tsvector entry
 * second, which is the reverse of the operand order used by the function
 * form of WordECompareQueryItem.  Presumably the sign of the result follows
 * that order -- confirm against tsCompareString().
 */
#define WordECompareQueryItem(e,q,p,i,m) \
tsCompareString((q) + (i)->distance, (i)->length, \
(e) + (p)->pos, (p)->len, (m))
/*
 * Returns a pointer to a WordEntry's array corresponding to 'item' from
 * tsvector 't'. 'q' is the TSQuery containing 'item'.
 * Returns NULL if not found.
 *
 * The WordEntry array spanning ARRPTR(t) up to STRPTR(t) is binary-searched
 * with WordECompareQueryItem().  (The three-argument form returns the single
 * WordEntry for which the comparison yields 0.)
 *
 * In the form that takes 'nitem', *nitem tells the caller how many entries
 * to read starting at the returned pointer:
 *	- it starts out as 0;
 *	- it is set to 1 when the search loop records a position and breaks;
 *	- when item->prefix is true it is reset to 0 and then counts the
 *	  consecutive entries, starting at StopMiddle, for which the prefix-mode
 *	  comparison returns 0.
 * The function returns StopHigh when *nitem > 0 and NULL otherwise.
 *
 * NOTE(review): this listing carries both the pre-change and the post-change
 * variant of several lines (two signatures, two StopMiddle declarations, two
 * comparison calls, two exits).  The five-argument comparison calls are the
 * ones that match the WordECompareQueryItem macro.
 */
static WordEntry *
find_wordentry(TSVector t, TSQuery q, QueryOperand *item)
find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
{
/* search window is [StopLow, StopHigh) over the WordEntry array of 't' */
WordEntry *StopLow = ARRPTR(t);
WordEntry *StopHigh = (WordEntry *) STRPTR(t);
WordEntry *StopMiddle;
/* initialised so that StopMiddle is defined even if the loop never runs */
WordEntry *StopMiddle = StopHigh;
int difference;
/* Loop invariant: StopLow <= item < StopHigh */
/* nothing found yet */
*nitem=0;
/* Loop invariant: StopLow <= item < StopHigh */
while (StopLow < StopHigh)
{
/* midpoint written as low + (high - low) / 2 */
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item);
/* last argument false: the lookup inside the loop is not in prefix mode */
difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
if (difference == 0)
return StopMiddle;
else if (difference < 0)
{
/*
 * Record StopMiddle as the result position (StopHigh is what the final
 * return statement hands back), report one entry, and stop searching.
 */
StopHigh = StopMiddle;
*nitem=1;
break;
}
else if (difference > 0)
StopLow = StopMiddle + 1;
else
StopHigh = StopMiddle;
}
return NULL;
if ( item->prefix == true )
{
/* the window is empty: begin the scan at StopHigh */
if ( StopLow >= StopHigh )
StopMiddle = StopHigh;
/* recount from scratch; any count set by the search loop is discarded */
*nitem=0;
/*
 * Walk forward while still inside the WordEntry array (its end is
 * STRPTR(t), the same bound StopHigh was initialised with) and while the
 * prefix-mode comparison (last argument true) keeps returning 0.
 */
while( StopMiddle < (WordEntry *) STRPTR(t) &&
WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0 )
{
(*nitem)++;
StopMiddle++;
}
}
/* callers in this listing iterate over *nitem entries from the result */
return ( *nitem > 0 ) ? StopHigh : NULL;
}
@@ -123,12 +138,9 @@ compareQueryOperand(const void *a, const void *b, void *arg)
QueryOperand *qa = (*(QueryOperand **) a);
QueryOperand *qb = (*(QueryOperand **) b);
if (qa->length == qb->length)
return strncmp(operand + qa->distance,
operand + qb->distance,
qb->length);
return (qa->length > qb->length) ? 1 : -1;
return tsCompareString(operand + qa->distance, qa->length,
operand + qb->distance, qb->length,
false);
}
/*
@@ -198,12 +210,14 @@ calc_rank_and(float *w, TSVector t, TSQuery q)
k,
l,
p;
WordEntry *entry;
WordEntry *entry,
*firstentry;
WordEntryPos *post,
*ct;
int4 dimt,
lenct,
dist;
dist,
nitem;
float res = -1.0;
QueryOperand **item;
int size = q->size;
@@ -219,40 +233,44 @@ calc_rank_and(float *w, TSVector t, TSQuery q)
for (i = 0; i < size; i++)
{
entry = find_wordentry(t, q, item[i]);
firstentry = entry = find_wordentry(t, q, item[i], &nitem);
if (!entry)
continue;
if (entry->haspos)
pos[i] = _POSVECPTR(t, entry);
else
pos[i] = &POSNULL;
dimt = pos[i]->npos;
post = pos[i]->pos;
for (k = 0; k < i; k++)
while( entry - firstentry < nitem )
{
if (!pos[k])
continue;
lenct = pos[k]->npos;
ct = pos[k]->pos;
for (l = 0; l < dimt; l++)
{
for (p = 0; p < lenct; p++)
{
dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
if (dist || (dist == 0 && (pos[i] == &POSNULL || pos[k] == &POSNULL)))
{
float curw;
if (entry->haspos)
pos[i] = _POSVECPTR(t, entry);
else
pos[i] = &POSNULL;
if (!dist)
dist = MAXENTRYPOS;
curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
dimt = pos[i]->npos;
post = pos[i]->pos;
for (k = 0; k < i; k++)
{
if (!pos[k])
continue;
lenct = pos[k]->npos;
ct = pos[k]->pos;
for (l = 0; l < dimt; l++)
{
for (p = 0; p < lenct; p++)
{
dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
if (dist || (dist == 0 && (pos[i] == &POSNULL || pos[k] == &POSNULL)))
{
float curw;
if (!dist)
dist = MAXENTRYPOS;
curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
}
}
}
}
entry++;
}
}
pfree(pos);
@@ -263,11 +281,13 @@ calc_rank_and(float *w, TSVector t, TSQuery q)
static float
calc_rank_or(float *w, TSVector t, TSQuery q)
{
WordEntry *entry;
WordEntry *entry,
*firstentry;
WordEntryPos *post;
int4 dimt,
j,
i;
i,
nitem;
float res = 0.0;
QueryOperand **item;
int size = q->size;
@@ -280,41 +300,46 @@ calc_rank_or(float *w, TSVector t, TSQuery q)
wjm;
int4 jm;
entry = find_wordentry(t, q, item[i]);
firstentry = entry = find_wordentry(t, q, item[i], &nitem);
if (!entry)
continue;
if (entry->haspos)
while( entry - firstentry < nitem )
{
dimt = POSDATALEN(t, entry);
post = POSDATAPTR(t, entry);
}
else
{
dimt = POSNULL.npos;
post = POSNULL.pos;
}
resj = 0.0;
wjm = -1.0;
jm = 0;
for (j = 0; j < dimt; j++)
{
resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
if (wpos(post[j]) > wjm)
if (entry->haspos)
{
wjm = wpos(post[j]);
jm = j;
dimt = POSDATALEN(t, entry);
post = POSDATAPTR(t, entry);
}
else
{
dimt = POSNULL.npos;
post = POSNULL.pos;
}
resj = 0.0;
wjm = -1.0;
jm = 0;
for (j = 0; j < dimt; j++)
{
resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
if (wpos(post[j]) > wjm)
{
wjm = wpos(post[j]);
jm = j;
}
}
}
/*
limit (sum(1/i^2),i->inf) = pi^2/6
resj = sum(wi/i^2),i=1,noccurrence,
wi - should be sorted desc,
don't sort for now, just choose maximum weight. This should be corrected
Oleg Bartunov
limit (sum(1/i^2),i->inf) = pi^2/6
resj = sum(wi/i^2),i=1,noccurrence,
wi - should be sorted desc,
don't sort for now, just choose maximum weight. This should be corrected
Oleg Bartunov
*/
res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
entry++;
}
}
if (size > 0)
res = res / size;
@@ -594,11 +619,13 @@ static DocRepresentation *
get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
{
QueryItem *item = GETQUERY(qr->query);
WordEntry *entry;
WordEntry *entry,
*firstentry;
WordEntryPos *post;
int4 dimt,
j,
i;
i,
nitem;
int len = qr->query->size * 4,
cur = 0;
DocRepresentation *doc;
@@ -619,63 +646,68 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
if (QR_GET_OPERAND_EXISTS(qr, &item[i]))
continue;
entry = find_wordentry(txt, qr->query, curoperand);
firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
if (!entry)
continue;
if (entry->haspos)
while( entry - firstentry < nitem )
{
dimt = POSDATALEN(txt, entry);
post = POSDATAPTR(txt, entry);
}
else
{
dimt = POSNULL.npos;
post = POSNULL.pos;
}
while (cur + dimt >= len)
{
len *= 2;
doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
}
for (j = 0; j < dimt; j++)
{
if (j == 0)
if (entry->haspos)
{
int k;
doc[cur].nitem = 0;
doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * qr->query->size);
for (k = 0; k < qr->query->size; k++)
{
QueryOperand *kptr = &item[k].operand;
QueryOperand *iptr = &item[i].operand;
if (k == i ||
(item[k].type == QI_VAL &&
compareQueryOperand(&kptr, &iptr, operand) == 0))
{
/*
* if k == i, we've already checked above that it's
* type == Q_VAL
*/
doc[cur].item[doc[cur].nitem] = item + k;
doc[cur].nitem++;
QR_SET_OPERAND_EXISTS(qr, item + k);
}
}
dimt = POSDATALEN(txt, entry);
post = POSDATAPTR(txt, entry);
}
else
{
doc[cur].nitem = doc[cur - 1].nitem;
doc[cur].item = doc[cur - 1].item;
dimt = POSNULL.npos;
post = POSNULL.pos;
}
doc[cur].pos = WEP_GETPOS(post[j]);
doc[cur].wclass = WEP_GETWEIGHT(post[j]);
cur++;
while (cur + dimt >= len)
{
len *= 2;
doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
}
for (j = 0; j < dimt; j++)
{
if (j == 0)
{
int k;
doc[cur].nitem = 0;
doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * qr->query->size);
for (k = 0; k < qr->query->size; k++)
{
QueryOperand *kptr = &item[k].operand;
QueryOperand *iptr = &item[i].operand;
if (k == i ||
(item[k].type == QI_VAL &&
compareQueryOperand(&kptr, &iptr, operand) == 0))
{
/*
* if k == i, we've already checked above that it's
* type == Q_VAL
*/
doc[cur].item[doc[cur].nitem] = item + k;
doc[cur].nitem++;
QR_SET_OPERAND_EXISTS(qr, item + k);
}
}
}
else
{
doc[cur].nitem = doc[cur - 1].nitem;
doc[cur].item = doc[cur - 1].item;
}
doc[cur].pos = WEP_GETPOS(post[j]);
doc[cur].wclass = WEP_GETWEIGHT(post[j]);
cur++;
}
entry++;
}
}