1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-30 06:01:21 +03:00

Tsearch2 functionality migrates to core. The bulk of this work is by

Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing,
so anything that's broken is probably my fault.

Documentation is nonexistent as yet, but let's land the patch so we can
get some portability testing done.
This commit is contained in:
Tom Lane
2007-08-21 01:11:32 +00:00
parent 4e94d1f952
commit 140d4ebcb4
200 changed files with 54388 additions and 147 deletions

View File

@@ -0,0 +1,804 @@
/*-------------------------------------------------------------------------
*
* tsrank.c
* rank tsvector by tsquery
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.1 2007/08/21 01:11:19 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <math.h>
#include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h"
#include "utils/array.h"
static float weights[] = {0.1, 0.2, 0.4, 1.0};
#define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
#define RANK_NO_NORM 0x00
#define RANK_NORM_LOGLENGTH 0x01
#define RANK_NORM_LENGTH 0x02
#define RANK_NORM_EXTDIST 0x04
#define RANK_NORM_UNIQ 0x08
#define RANK_NORM_LOGUNIQ 0x10
#define DEF_NORM_METHOD RANK_NO_NORM
static float calc_rank_or(float *w, TSVector t, TSQuery q);
static float calc_rank_and(float *w, TSVector t, TSQuery q);
/*
* Returns a weight of a word collocation
*/
static float4
word_distance(int4 w)
{
if (w > 100)
return 1e-30;
return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
}
static int
cnt_length(TSVector t)
{
WordEntry *ptr = ARRPTR(t),
*end = (WordEntry *) STRPTR(t);
int len = 0,
clen;
while (ptr < end)
{
if ((clen = POSDATALEN(t, ptr)) == 0)
len += 1;
else
len += clen;
ptr++;
}
return len;
}
static int4
WordECompareQueryItem(char *eval, char *qval, WordEntry * ptr, QueryItem * item)
{
if (ptr->len == item->length)
return strncmp(
eval + ptr->pos,
qval + item->distance,
item->length);
return (ptr->len > item->length) ? 1 : -1;
}
static WordEntry *
find_wordentry(TSVector t, TSQuery q, QueryItem * item)
{
WordEntry *StopLow = ARRPTR(t);
WordEntry *StopHigh = (WordEntry *) STRPTR(t);
WordEntry *StopMiddle;
int difference;
/* Loop invariant: StopLow <= item < StopHigh */
while (StopLow < StopHigh)
{
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item);
if (difference == 0)
return StopMiddle;
else if (difference < 0)
StopLow = StopMiddle + 1;
else
StopHigh = StopMiddle;
}
return NULL;
}
static int
compareQueryItem(const void *a, const void *b, void *arg)
{
char *operand = (char *) arg;
if ((*(QueryItem **) a)->length == (*(QueryItem **) b)->length)
return strncmp(operand + (*(QueryItem **) a)->distance,
operand + (*(QueryItem **) b)->distance,
(*(QueryItem **) b)->length);
return ((*(QueryItem **) a)->length > (*(QueryItem **) b)->length) ? 1 : -1;
}
static QueryItem **
SortAndUniqItems(char *operand, QueryItem * item, int *size)
{
QueryItem **res,
**ptr,
**prevptr;
ptr = res = (QueryItem **) palloc(sizeof(QueryItem *) * *size);
while ((*size)--)
{
if (item->type == VAL)
{
*ptr = item;
ptr++;
}
item++;
}
*size = ptr - res;
if (*size < 2)
return res;
qsort_arg(res, *size, sizeof(QueryItem **), compareQueryItem, (void *) operand);
ptr = res + 1;
prevptr = res;
while (ptr - res < *size)
{
if (compareQueryItem((void *) ptr, (void *) prevptr, (void *) operand) != 0)
{
prevptr++;
*prevptr = *ptr;
}
ptr++;
}
*size = prevptr + 1 - res;
return res;
}
static WordEntryPos POSNULL[] = {
0,
0
};
static float
calc_rank_and(float *w, TSVector t, TSQuery q)
{
uint16 **pos;
int i,
k,
l,
p;
WordEntry *entry;
WordEntryPos *post,
*ct;
int4 dimt,
lenct,
dist;
float res = -1.0;
QueryItem **item;
int size = q->size;
item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size);
if (size < 2)
{
pfree(item);
return calc_rank_or(w, t, q);
}
pos = (uint16 **) palloc(sizeof(uint16 *) * q->size);
memset(pos, 0, sizeof(uint16 *) * q->size);
*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1);
for (i = 0; i < size; i++)
{
entry = find_wordentry(t, q, item[i]);
if (!entry)
continue;
if (entry->haspos)
pos[i] = (uint16 *) _POSDATAPTR(t, entry);
else
pos[i] = (uint16 *) POSNULL;
dimt = *(uint16 *) (pos[i]);
post = (WordEntryPos *) (pos[i] + 1);
for (k = 0; k < i; k++)
{
if (!pos[k])
continue;
lenct = *(uint16 *) (pos[k]);
ct = (WordEntryPos *) (pos[k] + 1);
for (l = 0; l < dimt; l++)
{
for (p = 0; p < lenct; p++)
{
dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
if (dist || (dist == 0 && (pos[i] == (uint16 *) POSNULL || pos[k] == (uint16 *) POSNULL)))
{
float curw;
if (!dist)
dist = MAXENTRYPOS;
curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
}
}
}
}
}
pfree(pos);
pfree(item);
return res;
}
static float
calc_rank_or(float *w, TSVector t, TSQuery q)
{
WordEntry *entry;
WordEntryPos *post;
int4 dimt,
j,
i;
float res = 0.0;
QueryItem **item;
int size = q->size;
*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size);
for (i = 0; i < size; i++)
{
float resj,
wjm;
int4 jm;
entry = find_wordentry(t, q, item[i]);
if (!entry)
continue;
if (entry->haspos)
{
dimt = POSDATALEN(t, entry);
post = POSDATAPTR(t, entry);
}
else
{
dimt = *(uint16 *) POSNULL;
post = POSNULL + 1;
}
resj = 0.0;
wjm = -1.0;
jm = 0;
for (j = 0; j < dimt; j++)
{
resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
if (wpos(post[j]) > wjm)
{
wjm = wpos(post[j]);
jm = j;
}
}
/*
limit (sum(i/i^2),i->inf) = pi^2/6
resj = sum(wi/i^2),i=1,noccurence,
wi - should be sorted desc,
don't sort for now, just choose maximum weight. This should be corrected
Oleg Bartunov
*/
res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
}
if (size > 0)
res = res / size;
pfree(item);
return res;
}
static float
calc_rank(float *w, TSVector t, TSQuery q, int4 method)
{
QueryItem *item = GETQUERY(q);
float res = 0.0;
int len;
if (!t->size || !q->size)
return 0.0;
res = (item->type != VAL && item->val == (int4) '&') ?
calc_rank_and(w, t, q) : calc_rank_or(w, t, q);
if (res < 0)
res = 1e-20;
if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
res /= log((double) (cnt_length(t) + 1)) / log(2.0);
if (method & RANK_NORM_LENGTH)
{
len = cnt_length(t);
if (len > 0)
res /= (float) len;
}
if ((method & RANK_NORM_UNIQ) && t->size > 0)
res /= (float) (t->size);
if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
res /= log((double) (t->size + 1)) / log(2.0);
return res;
}
static float *
getWeights(ArrayType *win)
{
static float ws[lengthof(weights)];
int i;
float4 *arrdata;
if (win == 0)
return weights;
if (ARR_NDIM(win) != 1)
ereport(ERROR,
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
errmsg("array of weight must be one-dimensional")));
if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights))
ereport(ERROR,
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
errmsg("array of weight is too short")));
if (ARR_HASNULL(win))
ereport(ERROR,
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("array of weight must not contain nulls")));
arrdata = (float4 *) ARR_DATA_PTR(win);
for (i = 0; i < lengthof(weights); i++)
{
ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];
if (ws[i] > 1.0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("weight out of range")));
}
return ws;
}
Datum
ts_rank_wttf(PG_FUNCTION_ARGS)
{
ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
TSVector txt = PG_GETARG_TSVECTOR(1);
TSQuery query = PG_GETARG_TSQUERY(2);
int method = PG_GETARG_INT32(3);
float res;
res = calc_rank(getWeights(win), txt, query, method);
PG_FREE_IF_COPY(win, 0);
PG_FREE_IF_COPY(txt, 1);
PG_FREE_IF_COPY(query, 2);
PG_RETURN_FLOAT4(res);
}
Datum
ts_rank_wtt(PG_FUNCTION_ARGS)
{
ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
TSVector txt = PG_GETARG_TSVECTOR(1);
TSQuery query = PG_GETARG_TSQUERY(2);
float res;
res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD);
PG_FREE_IF_COPY(win, 0);
PG_FREE_IF_COPY(txt, 1);
PG_FREE_IF_COPY(query, 2);
PG_RETURN_FLOAT4(res);
}
Datum
ts_rank_ttf(PG_FUNCTION_ARGS)
{
TSVector txt = PG_GETARG_TSVECTOR(0);
TSQuery query = PG_GETARG_TSQUERY(1);
int method = PG_GETARG_INT32(2);
float res;
res = calc_rank(getWeights(NULL), txt, query, method);
PG_FREE_IF_COPY(txt, 0);
PG_FREE_IF_COPY(query, 1);
PG_RETURN_FLOAT4(res);
}
Datum
ts_rank_tt(PG_FUNCTION_ARGS)
{
TSVector txt = PG_GETARG_TSVECTOR(0);
TSQuery query = PG_GETARG_TSQUERY(1);
float res;
res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD);
PG_FREE_IF_COPY(txt, 0);
PG_FREE_IF_COPY(query, 1);
PG_RETURN_FLOAT4(res);
}
typedef struct
{
QueryItem **item;
int16 nitem;
bool needfree;
uint8 wclass;
int32 pos;
} DocRepresentation;
static int
compareDocR(const void *a, const void *b)
{
if (((DocRepresentation *) a)->pos == ((DocRepresentation *) b)->pos)
return 0;
return (((DocRepresentation *) a)->pos > ((DocRepresentation *) b)->pos) ? 1 : -1;
}
static bool
checkcondition_QueryItem(void *checkval, QueryItem * val)
{
return (bool) (val->istrue);
}
static void
reset_istrue_flag(TSQuery query)
{
QueryItem *item = GETQUERY(query);
int i;
/* reset istrue flag */
for (i = 0; i < query->size; i++)
{
if (item->type == VAL)
item->istrue = 0;
item++;
}
}
typedef struct
{
int pos;
int p;
int q;
DocRepresentation *begin;
DocRepresentation *end;
} Extention;
static bool
Cover(DocRepresentation * doc, int len, TSQuery query, Extention * ext)
{
DocRepresentation *ptr;
int lastpos = ext->pos;
int i;
bool found = false;
reset_istrue_flag(query);
ext->p = 0x7fffffff;
ext->q = 0;
ptr = doc + ext->pos;
/* find upper bound of cover from current position, move up */
while (ptr - doc < len)
{
for (i = 0; i < ptr->nitem; i++)
ptr->item[i]->istrue = 1;
if (TS_execute(GETQUERY(query), NULL, false, checkcondition_QueryItem))
{
if (ptr->pos > ext->q)
{
ext->q = ptr->pos;
ext->end = ptr;
lastpos = ptr - doc;
found = true;
}
break;
}
ptr++;
}
if (!found)
return false;
reset_istrue_flag(query);
ptr = doc + lastpos;
/* find lower bound of cover from founded upper bound, move down */
while (ptr >= doc + ext->pos)
{
for (i = 0; i < ptr->nitem; i++)
ptr->item[i]->istrue = 1;
if (TS_execute(GETQUERY(query), NULL, true, checkcondition_QueryItem))
{
if (ptr->pos < ext->p)
{
ext->begin = ptr;
ext->p = ptr->pos;
}
break;
}
ptr--;
}
if (ext->p <= ext->q)
{
/*
* set position for next try to next lexeme after begining of founded
* cover
*/
ext->pos = (ptr - doc) + 1;
return true;
}
ext->pos++;
return Cover(doc, len, query, ext);
}
static DocRepresentation *
get_docrep(TSVector txt, TSQuery query, int *doclen)
{
QueryItem *item = GETQUERY(query);
WordEntry *entry;
WordEntryPos *post;
int4 dimt,
j,
i;
int len = query->size * 4,
cur = 0;
DocRepresentation *doc;
char *operand;
*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
operand = GETOPERAND(query);
reset_istrue_flag(query);
for (i = 0; i < query->size; i++)
{
if (item[i].type != VAL || item[i].istrue)
continue;
entry = find_wordentry(txt, query, &(item[i]));
if (!entry)
continue;
if (entry->haspos)
{
dimt = POSDATALEN(txt, entry);
post = POSDATAPTR(txt, entry);
}
else
{
dimt = *(uint16 *) POSNULL;
post = POSNULL + 1;
}
while (cur + dimt >= len)
{
len *= 2;
doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
}
for (j = 0; j < dimt; j++)
{
if (j == 0)
{
QueryItem *kptr,
*iptr = item + i;
int k;
doc[cur].needfree = false;
doc[cur].nitem = 0;
doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * query->size);
for (k = 0; k < query->size; k++)
{
kptr = item + k;
if (k == i ||
(item[k].type == VAL &&
compareQueryItem(&kptr, &iptr, operand) == 0))
{
doc[cur].item[doc[cur].nitem] = item + k;
doc[cur].nitem++;
kptr->istrue = 1;
}
}
}
else
{
doc[cur].needfree = false;
doc[cur].nitem = doc[cur - 1].nitem;
doc[cur].item = doc[cur - 1].item;
}
doc[cur].pos = WEP_GETPOS(post[j]);
doc[cur].wclass = WEP_GETWEIGHT(post[j]);
cur++;
}
}
*doclen = cur;
if (cur > 0)
{
if (cur > 1)
qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
return doc;
}
pfree(doc);
return NULL;
}
static float4
calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method)
{
DocRepresentation *doc;
int len,
i,
doclen = 0;
Extention ext;
double Wdoc = 0.0;
double invws[lengthof(weights)];
double SumDist = 0.0,
PrevExtPos = 0.0,
CurExtPos = 0.0;
int NExtent = 0;
for (i = 0; i < lengthof(weights); i++)
{
invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
if (invws[i] > 1.0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("weight out of range")));
invws[i] = 1.0 / invws[i];
}
doc = get_docrep(txt, query, &doclen);
if (!doc)
return 0.0;
MemSet(&ext, 0, sizeof(Extention));
while (Cover(doc, doclen, query, &ext))
{
double Cpos = 0.0;
double InvSum = 0.0;
int nNoise;
DocRepresentation *ptr = ext.begin;
while (ptr <= ext.end)
{
InvSum += invws[ptr->wclass];
ptr++;
}
Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum;
/*
* if doc are big enough then ext.q may be equal to ext.p due to limit
* of posional information. In this case we approximate number of
* noise word as half cover's length
*/
nNoise = (ext.q - ext.p) - (ext.end - ext.begin);
if (nNoise < 0)
nNoise = (ext.end - ext.begin) / 2;
Wdoc += Cpos / ((double) (1 + nNoise));
CurExtPos = ((double) (ext.q + ext.p)) / 2.0;
if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent devision by
* zero in a case of
multiple lexize */ )
SumDist += 1.0 / (CurExtPos - PrevExtPos);
PrevExtPos = CurExtPos;
NExtent++;
}
if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
Wdoc /= log((double) (cnt_length(txt) + 1));
if (method & RANK_NORM_LENGTH)
{
len = cnt_length(txt);
if (len > 0)
Wdoc /= (double) len;
}
if ((method & RANK_NORM_EXTDIST) && SumDist > 0)
Wdoc /= ((double) NExtent) / SumDist;
if ((method & RANK_NORM_UNIQ) && txt->size > 0)
Wdoc /= (double) (txt->size);
if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
Wdoc /= log((double) (txt->size + 1)) / log(2.0);
for (i = 0; i < doclen; i++)
if (doc[i].needfree)
pfree(doc[i].item);
pfree(doc);
return (float4) Wdoc;
}
Datum
ts_rankcd_wttf(PG_FUNCTION_ARGS)
{
ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
TSVector txt = PG_GETARG_TSVECTOR(1);
TSQuery query = PG_GETARG_TSQUERY_COPY(2);
int method = PG_GETARG_INT32(3);
float res;
res = calc_rank_cd(getWeights(win), txt, query, method);
PG_FREE_IF_COPY(win, 0);
PG_FREE_IF_COPY(txt, 1);
PG_FREE_IF_COPY(query, 2);
PG_RETURN_FLOAT4(res);
}
Datum
ts_rankcd_wtt(PG_FUNCTION_ARGS)
{
ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
TSVector txt = PG_GETARG_TSVECTOR(1);
TSQuery query = PG_GETARG_TSQUERY_COPY(2);
float res;
res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD);
PG_FREE_IF_COPY(win, 0);
PG_FREE_IF_COPY(txt, 1);
PG_FREE_IF_COPY(query, 2);
PG_RETURN_FLOAT4(res);
}
Datum
ts_rankcd_ttf(PG_FUNCTION_ARGS)
{
TSVector txt = PG_GETARG_TSVECTOR(0);
TSQuery query = PG_GETARG_TSQUERY_COPY(1);
int method = PG_GETARG_INT32(2);
float res;
res = calc_rank_cd(getWeights(NULL), txt, query, method);
PG_FREE_IF_COPY(txt, 0);
PG_FREE_IF_COPY(query, 1);
PG_RETURN_FLOAT4(res);
}
Datum
ts_rankcd_tt(PG_FUNCTION_ARGS)
{
TSVector txt = PG_GETARG_TSVECTOR(0);
TSQuery query = PG_GETARG_TSQUERY_COPY(1);
float res;
res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD);
PG_FREE_IF_COPY(txt, 0);
PG_FREE_IF_COPY(query, 1);
PG_RETURN_FLOAT4(res);
}