1
0
mirror of https://github.com/postgres/postgres.git synced 2025-04-22 23:02:54 +03:00
postgres/src/backend/utils/adt/tsvector_op.c
Tom Lane bb36c51fcd Fix several bugs in tsvectorin, including crash due to uninitialized field and
miscomputation of required palloc size.  The crash could only occur if the
input contained lexemes both with and without positions, which is probably not
common in practice.  The miscomputation would definitely result in wasted
space.  Also fix some inconsistent coding around alignment of strings and
positions in a tsvector value; these errors could also lead to crashes given
mixed with/without position data and a machine that's picky about alignment.
And be more careful about checking for overflow of string offsets.

Patch is only against HEAD --- I have not looked to see if same bugs are
in back-branch contrib/tsearch2 code.
2007-10-23 00:51:23 +00:00

1417 lines
31 KiB
C

/*-------------------------------------------------------------------------
*
* tsvector_op.c
* operations over tsvector
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.6 2007/10/23 00:51:23 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "catalog/namespace.h"
#include "commands/trigger.h"
#include "executor/spi.h"
#include "funcapi.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
typedef struct
{
WordEntry *arrb;
WordEntry *arre;
char *values;
char *operand;
} CHKVAL;
typedef struct
{
uint32 cur;
TSVector stat;
} StatStorage;
typedef struct
{
uint32 len;
uint32 pos;
uint32 ndoc;
uint32 nentry;
} StatEntry;
typedef struct
{
int32 vl_len_; /* varlena header (do not touch directly!) */
int4 size;
int4 weight;
char data[1];
} tsstat;
#define STATHDRSIZE (sizeof(int4) * 4)
#define CALCSTATSIZE(x, lenstr) ( (x) * sizeof(StatEntry) + STATHDRSIZE + (lenstr) )
#define STATPTR(x) ( (StatEntry*) ( (char*)(x) + STATHDRSIZE ) )
#define STATSTRPTR(x) ( (char*)(x) + STATHDRSIZE + ( sizeof(StatEntry) * ((TSVector)(x))->size ) )
#define STATSTRSIZE(x) ( VARSIZE((TSVector)(x)) - STATHDRSIZE - ( sizeof(StatEntry) * ((TSVector)(x))->size ) )
static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
/*
* Order: haspos, len, word, for all positions (pos, weight)
*/
static int
silly_cmp_tsvector(const TSVector a, const TSVector b)
{
if (VARSIZE(a) < VARSIZE(b))
return -1;
else if (VARSIZE(a) > VARSIZE(b))
return 1;
else if (a->size < b->size)
return -1;
else if (a->size > b->size)
return 1;
else
{
WordEntry *aptr = ARRPTR(a);
WordEntry *bptr = ARRPTR(b);
int i = 0;
int res;
for (i = 0; i < a->size; i++)
{
if (aptr->haspos != bptr->haspos)
{
return (aptr->haspos > bptr->haspos) ? -1 : 1;
}
else if (aptr->len != bptr->len)
{
return (aptr->len > bptr->len) ? -1 : 1;
}
else if ((res = strncmp(STRPTR(a) + aptr->pos, STRPTR(b) + bptr->pos, bptr->len)) != 0)
{
return res;
}
else if (aptr->haspos)
{
WordEntryPos *ap = POSDATAPTR(a, aptr);
WordEntryPos *bp = POSDATAPTR(b, bptr);
int j;
if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr))
return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1;
for (j = 0; j < POSDATALEN(a, aptr); j++)
{
if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp))
{
return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1;
}
else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp))
{
return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1;
}
ap++, bp++;
}
}
aptr++;
bptr++;
}
}
return 0;
}
#define TSVECTORCMPFUNC( type, action, ret ) \
Datum \
tsvector_##type(PG_FUNCTION_ARGS) \
{ \
TSVector a = PG_GETARG_TSVECTOR(0); \
TSVector b = PG_GETARG_TSVECTOR(1); \
int res = silly_cmp_tsvector(a, b); \
PG_FREE_IF_COPY(a,0); \
PG_FREE_IF_COPY(b,1); \
PG_RETURN_##ret( res action 0 ); \
}
TSVECTORCMPFUNC(lt, <, BOOL);
TSVECTORCMPFUNC(le, <=, BOOL);
TSVECTORCMPFUNC(eq, ==, BOOL);
TSVECTORCMPFUNC(ge, >=, BOOL);
TSVECTORCMPFUNC(gt, >, BOOL);
TSVECTORCMPFUNC(ne, !=, BOOL);
TSVECTORCMPFUNC(cmp, +, INT32);
Datum
tsvector_strip(PG_FUNCTION_ARGS)
{
TSVector in = PG_GETARG_TSVECTOR(0);
TSVector out;
int i,
len = 0;
WordEntry *arrin = ARRPTR(in),
*arrout;
char *cur;
for (i = 0; i < in->size; i++)
len += arrin[i].len;
len = CALCDATASIZE(in->size, len);
out = (TSVector) palloc0(len);
SET_VARSIZE(out, len);
out->size = in->size;
arrout = ARRPTR(out);
cur = STRPTR(out);
for (i = 0; i < in->size; i++)
{
memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len);
arrout[i].haspos = 0;
arrout[i].len = arrin[i].len;
arrout[i].pos = cur - STRPTR(out);
cur += arrout[i].len;
}
PG_FREE_IF_COPY(in, 0);
PG_RETURN_POINTER(out);
}
Datum
tsvector_length(PG_FUNCTION_ARGS)
{
TSVector in = PG_GETARG_TSVECTOR(0);
int4 ret = in->size;
PG_FREE_IF_COPY(in, 0);
PG_RETURN_INT32(ret);
}
Datum
tsvector_setweight(PG_FUNCTION_ARGS)
{
TSVector in = PG_GETARG_TSVECTOR(0);
char cw = PG_GETARG_CHAR(1);
TSVector out;
int i,
j;
WordEntry *entry;
WordEntryPos *p;
int w = 0;
switch (cw)
{
case 'A':
case 'a':
w = 3;
break;
case 'B':
case 'b':
w = 2;
break;
case 'C':
case 'c':
w = 1;
break;
case 'D':
case 'd':
w = 0;
break;
/* internal error */
default:
elog(ERROR, "unrecognized weight");
}
out = (TSVector) palloc(VARSIZE(in));
memcpy(out, in, VARSIZE(in));
entry = ARRPTR(out);
i = out->size;
while (i--)
{
if ((j = POSDATALEN(out, entry)) != 0)
{
p = POSDATAPTR(out, entry);
while (j--)
{
WEP_SETWEIGHT(*p, w);
p++;
}
}
entry++;
}
PG_FREE_IF_COPY(in, 0);
PG_RETURN_POINTER(out);
}
static int
compareEntry(char *ptra, WordEntry * a, char *ptrb, WordEntry * b)
{
if (a->len == b->len)
{
return strncmp(
ptra + a->pos,
ptrb + b->pos,
a->len);
}
return (a->len > b->len) ? 1 : -1;
}
/*
* Add positions from src to dest after offsetting them by maxpos.
* Return the number added (might be less than expected due to overflow)
*/
static int4
add_pos(TSVector src, WordEntry * srcptr,
TSVector dest, WordEntry * destptr,
int4 maxpos)
{
uint16 *clen = &_POSVECPTR(dest, destptr)->npos;
int i;
uint16 slen = POSDATALEN(src, srcptr),
startlen;
WordEntryPos *spos = POSDATAPTR(src, srcptr),
*dpos = POSDATAPTR(dest, destptr);
if (!destptr->haspos)
*clen = 0;
startlen = *clen;
for (i = 0;
i < slen && *clen < MAXNUMPOS &&
(*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1);
i++)
{
WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i]));
WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
(*clen)++;
}
if (*clen != startlen)
destptr->haspos = 1;
return *clen - startlen;
}
Datum
tsvector_concat(PG_FUNCTION_ARGS)
{
TSVector in1 = PG_GETARG_TSVECTOR(0);
TSVector in2 = PG_GETARG_TSVECTOR(1);
TSVector out;
WordEntry *ptr;
WordEntry *ptr1,
*ptr2;
WordEntryPos *p;
int maxpos = 0,
i,
j,
i1,
i2,
dataoff;
char *data,
*data1,
*data2;
ptr = ARRPTR(in1);
i = in1->size;
while (i--)
{
if ((j = POSDATALEN(in1, ptr)) != 0)
{
p = POSDATAPTR(in1, ptr);
while (j--)
{
if (WEP_GETPOS(*p) > maxpos)
maxpos = WEP_GETPOS(*p);
p++;
}
}
ptr++;
}
ptr1 = ARRPTR(in1);
ptr2 = ARRPTR(in2);
data1 = STRPTR(in1);
data2 = STRPTR(in2);
i1 = in1->size;
i2 = in2->size;
/* conservative estimate of space needed */
out = (TSVector) palloc0(VARSIZE(in1) + VARSIZE(in2));
SET_VARSIZE(out, VARSIZE(in1) + VARSIZE(in2));
out->size = in1->size + in2->size;
ptr = ARRPTR(out);
data = STRPTR(out);
dataoff = 0;
while (i1 && i2)
{
int cmp = compareEntry(data1, ptr1, data2, ptr2);
if (cmp < 0)
{ /* in1 first */
ptr->haspos = ptr1->haspos;
ptr->len = ptr1->len;
memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
ptr->pos = dataoff;
dataoff += ptr1->len;
if (ptr->haspos)
{
dataoff = SHORTALIGN(dataoff);
memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
}
ptr++;
ptr1++;
i1--;
}
else if (cmp > 0)
{ /* in2 first */
ptr->haspos = ptr2->haspos;
ptr->len = ptr2->len;
memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
ptr->pos = dataoff;
dataoff += ptr2->len;
if (ptr->haspos)
{
int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
if (addlen == 0)
ptr->haspos = 0;
else
{
dataoff = SHORTALIGN(dataoff);
dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
}
}
ptr++;
ptr2++;
i2--;
}
else
{
ptr->haspos = ptr1->haspos | ptr2->haspos;
ptr->len = ptr1->len;
memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
ptr->pos = dataoff;
dataoff += ptr1->len;
if (ptr->haspos)
{
if (ptr1->haspos)
{
dataoff = SHORTALIGN(dataoff);
memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
if (ptr2->haspos)
dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
}
else /* must have ptr2->haspos */
{
int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
if (addlen == 0)
ptr->haspos = 0;
else
{
dataoff = SHORTALIGN(dataoff);
dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
}
}
}
ptr++;
ptr1++;
ptr2++;
i1--;
i2--;
}
}
while (i1)
{
ptr->haspos = ptr1->haspos;
ptr->len = ptr1->len;
memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
ptr->pos = dataoff;
dataoff += ptr1->len;
if (ptr->haspos)
{
dataoff = SHORTALIGN(dataoff);
memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
}
ptr++;
ptr1++;
i1--;
}
while (i2)
{
ptr->haspos = ptr2->haspos;
ptr->len = ptr2->len;
memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
ptr->pos = dataoff;
dataoff += ptr2->len;
if (ptr->haspos)
{
int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
if (addlen == 0)
ptr->haspos = 0;
else
{
dataoff = SHORTALIGN(dataoff);
dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
}
}
ptr++;
ptr2++;
i2--;
}
/*
* Instead of checking each offset individually, we check for overflow
* of pos fields once at the end.
*/
if (dataoff > MAXSTRPOS)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("string is too long for tsvector")));
out->size = ptr - ARRPTR(out);
SET_VARSIZE(out, CALCDATASIZE(out->size, dataoff));
if (data != STRPTR(out))
memmove(STRPTR(out), data, dataoff);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_POINTER(out);
}
/*
* compare 2 string values
*/
static int4
ValCompare(CHKVAL * chkval, WordEntry * ptr, QueryOperand * item)
{
if (ptr->len == item->length)
return strncmp(
&(chkval->values[ptr->pos]),
&(chkval->operand[item->distance]),
item->length);
return (ptr->len > item->length) ? 1 : -1;
}
/*
* check weight info
*/
static bool
checkclass_str(CHKVAL *chkval, WordEntry *val, QueryOperand *item)
{
WordEntryPosVector *posvec;
WordEntryPos *ptr;
uint16 len;
posvec = (WordEntryPosVector *)
(chkval->values + SHORTALIGN(val->pos + val->len));
len = posvec->npos;
ptr = posvec->pos;
while (len--)
{
if (item->weight & (1 << WEP_GETWEIGHT(*ptr)))
return true;
ptr++;
}
return false;
}
/*
* is there value 'val' in array or not ?
*/
static bool
checkcondition_str(void *checkval, QueryOperand * val)
{
CHKVAL *chkval = (CHKVAL *) checkval;
WordEntry *StopLow = chkval->arrb;
WordEntry *StopHigh = chkval->arre;
WordEntry *StopMiddle;
int difference;
/* Loop invariant: StopLow <= val < StopHigh */
while (StopLow < StopHigh)
{
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
difference = ValCompare(chkval, StopMiddle, val);
if (difference == 0)
return (val->weight && StopMiddle->haspos) ?
checkclass_str(chkval, StopMiddle, val) : true;
else if (difference < 0)
StopLow = StopMiddle + 1;
else
StopHigh = StopMiddle;
}
return (false);
}
/*
* check for boolean condition.
*
* if calcnot is false, NOT expressions are always evaluated to be true. This is used in ranking.
* checkval can be used to pass information to the callback. TS_execute doesn't
* do anything with it.
* chkcond is a callback function used to evaluate each VAL node in the query.
*
*/
bool
TS_execute(QueryItem * curitem, void *checkval, bool calcnot,
bool (*chkcond) (void *checkval, QueryOperand * val))
{
/* since this function recurses, it could be driven to stack overflow */
check_stack_depth();
if (curitem->type == QI_VAL)
return chkcond(checkval, (QueryOperand *) curitem);
switch(curitem->operator.oper)
{
case OP_NOT:
if (calcnot)
return !TS_execute(curitem + 1, checkval, calcnot, chkcond);
else
return true;
case OP_AND:
if (TS_execute(curitem + curitem->operator.left, checkval, calcnot, chkcond))
return TS_execute(curitem + 1, checkval, calcnot, chkcond);
else
return false;
case OP_OR:
if (TS_execute(curitem + curitem->operator.left, checkval, calcnot, chkcond))
return true;
else
return TS_execute(curitem + 1, checkval, calcnot, chkcond);
default:
elog(ERROR, "unknown operator %d", curitem->operator.oper);
}
/* not reachable, but keep compiler quiet */
return false;
}
/*
* boolean operations
*/
Datum
ts_match_qv(PG_FUNCTION_ARGS)
{
PG_RETURN_DATUM(DirectFunctionCall2(ts_match_vq,
PG_GETARG_DATUM(1),
PG_GETARG_DATUM(0)));
}
Datum
ts_match_vq(PG_FUNCTION_ARGS)
{
TSVector val = PG_GETARG_TSVECTOR(0);
TSQuery query = PG_GETARG_TSQUERY(1);
CHKVAL chkval;
bool result;
if (!val->size || !query->size)
{
PG_FREE_IF_COPY(val, 0);
PG_FREE_IF_COPY(query, 1);
PG_RETURN_BOOL(false);
}
chkval.arrb = ARRPTR(val);
chkval.arre = chkval.arrb + val->size;
chkval.values = STRPTR(val);
chkval.operand = GETOPERAND(query);
result = TS_execute(
GETQUERY(query),
&chkval,
true,
checkcondition_str
);
PG_FREE_IF_COPY(val, 0);
PG_FREE_IF_COPY(query, 1);
PG_RETURN_BOOL(result);
}
Datum
ts_match_tt(PG_FUNCTION_ARGS)
{
TSVector vector;
TSQuery query;
bool res;
vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
PG_GETARG_DATUM(0)));
query = DatumGetTSQuery(DirectFunctionCall1(plainto_tsquery,
PG_GETARG_DATUM(1)));
res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
TSVectorGetDatum(vector),
TSQueryGetDatum(query)));
pfree(vector);
pfree(query);
PG_RETURN_BOOL(res);
}
Datum
ts_match_tq(PG_FUNCTION_ARGS)
{
TSVector vector;
TSQuery query = PG_GETARG_TSQUERY(1);
bool res;
vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
PG_GETARG_DATUM(0)));
res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
TSVectorGetDatum(vector),
TSQueryGetDatum(query)));
pfree(vector);
PG_FREE_IF_COPY(query, 1);
PG_RETURN_BOOL(res);
}
/*
* ts_stat statistic function support
*/
/*
* Returns the number of positions in value 'wptr' within tsvector 'txt',
* that have a weight equal to one of the weights in 'weight' bitmask.
*/
static int
check_weight(TSVector txt, WordEntry * wptr, int8 weight)
{
int len = POSDATALEN(txt, wptr);
int num = 0;
WordEntryPos *ptr = POSDATAPTR(txt, wptr);
while (len--)
{
if (weight & (1 << WEP_GETWEIGHT(*ptr)))
num++;
ptr++;
}
return num;
}
static WordEntry **
SEI_realloc(WordEntry ** in, uint32 *len)
{
if (*len == 0 || in == NULL)
{
*len = 8;
in = palloc(sizeof(WordEntry *) * (*len));
}
else
{
*len *= 2;
in = repalloc(in, sizeof(WordEntry *) * (*len));
}
return in;
}
static int
compareStatWord(StatEntry * a, WordEntry * b, tsstat * stat, TSVector txt)
{
if (a->len == b->len)
return strncmp(
STATSTRPTR(stat) + a->pos,
STRPTR(txt) + b->pos,
a->len
);
return (a->len > b->len) ? 1 : -1;
}
static tsstat *
formstat(tsstat * stat, TSVector txt, WordEntry ** entry, uint32 len)
{
tsstat *newstat;
uint32 totallen,
nentry;
uint32 slen = 0;
WordEntry **ptr = entry;
char *curptr;
StatEntry *sptr,
*nptr;
while (ptr - entry < len)
{
slen += (*ptr)->len;
ptr++;
}
nentry = stat->size + len;
slen += STATSTRSIZE(stat);
totallen = CALCSTATSIZE(nentry, slen);
newstat = palloc(totallen);
SET_VARSIZE(newstat, totallen);
newstat->weight = stat->weight;
newstat->size = nentry;
memcpy(STATSTRPTR(newstat), STATSTRPTR(stat), STATSTRSIZE(stat));
curptr = STATSTRPTR(newstat) + STATSTRSIZE(stat);
ptr = entry;
sptr = STATPTR(stat);
nptr = STATPTR(newstat);
if (len == 1)
{
StatEntry *StopLow = STATPTR(stat);
StatEntry *StopHigh = (StatEntry *) STATSTRPTR(stat);
while (StopLow < StopHigh)
{
sptr = StopLow + (StopHigh - StopLow) / 2;
if (compareStatWord(sptr, *ptr, stat, txt) < 0)
StopLow = sptr + 1;
else
StopHigh = sptr;
}
nptr = STATPTR(newstat) + (StopLow - STATPTR(stat));
memcpy(STATPTR(newstat), STATPTR(stat), sizeof(StatEntry) * (StopLow - STATPTR(stat)));
if ((*ptr)->haspos)
nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
else
nptr->nentry = 1;
nptr->ndoc = 1;
nptr->len = (*ptr)->len;
memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len);
nptr->pos = curptr - STATSTRPTR(newstat);
memcpy(nptr + 1, StopLow, sizeof(StatEntry) * (((StatEntry *) STATSTRPTR(stat)) - StopLow));
}
else
{
while (sptr - STATPTR(stat) < stat->size && ptr - entry < len)
{
if (compareStatWord(sptr, *ptr, stat, txt) < 0)
{
memcpy(nptr, sptr, sizeof(StatEntry));
sptr++;
}
else
{
if ((*ptr)->haspos)
nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
else
nptr->nentry = 1;
nptr->ndoc = 1;
nptr->len = (*ptr)->len;
memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len);
nptr->pos = curptr - STATSTRPTR(newstat);
curptr += nptr->len;
ptr++;
}
nptr++;
}
memcpy(nptr, sptr, sizeof(StatEntry) * (stat->size - (sptr - STATPTR(stat))));
while (ptr - entry < len)
{
if ((*ptr)->haspos)
nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
else
nptr->nentry = 1;
nptr->ndoc = 1;
nptr->len = (*ptr)->len;
memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len);
nptr->pos = curptr - STATSTRPTR(newstat);
curptr += nptr->len;
ptr++;
nptr++;
}
}
return newstat;
}
/*
* This is written like a custom aggregate function, because the
* original plan was to do just that. Unfortunately, an aggregate function
* can't return a set, so that plan was abandoned. If that limitation is
* lifted in the future, ts_stat could be a real aggregate function so that
* you could use it like this:
*
* SELECT ts_stat(vector_column) FROM vector_table;
*
* where vector_column is a tsvector-type column in vector_table.
*/
static tsstat *
ts_accum(tsstat * stat, Datum data)
{
tsstat *newstat;
TSVector txt = DatumGetTSVector(data);
WordEntry **newentry = NULL;
uint32 len = 0,
cur = 0;
StatEntry *sptr;
WordEntry *wptr;
int n = 0;
if (stat == NULL)
{ /* Init in first */
stat = palloc(STATHDRSIZE);
SET_VARSIZE(stat, STATHDRSIZE);
stat->size = 0;
stat->weight = 0;
}
/* simple check of correctness */
if (txt == NULL || txt->size == 0)
{
if (txt != (TSVector) DatumGetPointer(data))
pfree(txt);
return stat;
}
sptr = STATPTR(stat);
wptr = ARRPTR(txt);
if (stat->size < 100 * txt->size)
{ /* merge */
while (sptr - STATPTR(stat) < stat->size && wptr - ARRPTR(txt) < txt->size)
{
int cmp = compareStatWord(sptr, wptr, stat, txt);
if (cmp < 0)
sptr++;
else if (cmp == 0)
{
if (stat->weight == 0)
{
sptr->ndoc++;
sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1;
}
else if (wptr->haspos && (n = check_weight(txt, wptr, stat->weight)) != 0)
{
sptr->ndoc++;
sptr->nentry += n;
}
sptr++;
wptr++;
}
else
{
if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0)
{
if (cur == len)
newentry = SEI_realloc(newentry, &len);
newentry[cur] = wptr;
cur++;
}
wptr++;
}
}
while (wptr - ARRPTR(txt) < txt->size)
{
if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0)
{
if (cur == len)
newentry = SEI_realloc(newentry, &len);
newentry[cur] = wptr;
cur++;
}
wptr++;
}
}
else
{ /* search */
while (wptr - ARRPTR(txt) < txt->size)
{
StatEntry *StopLow = STATPTR(stat);
StatEntry *StopHigh = (StatEntry *) STATSTRPTR(stat);
int cmp;
while (StopLow < StopHigh)
{
sptr = StopLow + (StopHigh - StopLow) / 2;
cmp = compareStatWord(sptr, wptr, stat, txt);
if (cmp == 0)
{
if (stat->weight == 0)
{
sptr->ndoc++;
sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1;
}
else if (wptr->haspos && (n = check_weight(txt, wptr, stat->weight)) != 0)
{
sptr->ndoc++;
sptr->nentry += n;
}
break;
}
else if (cmp < 0)
StopLow = sptr + 1;
else
StopHigh = sptr;
}
if (StopLow >= StopHigh)
{ /* not found */
if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0)
{
if (cur == len)
newentry = SEI_realloc(newentry, &len);
newentry[cur] = wptr;
cur++;
}
}
wptr++;
}
}
if (cur == 0)
{ /* no new words */
if (txt != (TSVector) DatumGetPointer(data))
pfree(txt);
return stat;
}
newstat = formstat(stat, txt, newentry, cur);
pfree(newentry);
if (txt != (TSVector) DatumGetPointer(data))
pfree(txt);
return newstat;
}
static void
ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx,
tsstat * stat)
{
TupleDesc tupdesc;
MemoryContext oldcontext;
StatStorage *st;
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
st = palloc(sizeof(StatStorage));
st->cur = 0;
st->stat = palloc(VARSIZE(stat));
memcpy(st->stat, stat, VARSIZE(stat));
funcctx->user_fctx = (void *) st;
tupdesc = CreateTemplateTupleDesc(3, false);
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "word",
TEXTOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "ndoc",
INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "nentry",
INT4OID, -1, 0);
funcctx->tuple_desc = BlessTupleDesc(tupdesc);
funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
MemoryContextSwitchTo(oldcontext);
}
static Datum
ts_process_call(FuncCallContext *funcctx)
{
StatStorage *st;
st = (StatStorage *) funcctx->user_fctx;
if (st->cur < st->stat->size)
{
Datum result;
char *values[3];
char ndoc[16];
char nentry[16];
StatEntry *entry = STATPTR(st->stat) + st->cur;
HeapTuple tuple;
values[0] = palloc(entry->len + 1);
memcpy(values[0], STATSTRPTR(st->stat) + entry->pos, entry->len);
(values[0])[entry->len] = '\0';
sprintf(ndoc, "%d", entry->ndoc);
values[1] = ndoc;
sprintf(nentry, "%d", entry->nentry);
values[2] = nentry;
tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
result = HeapTupleGetDatum(tuple);
pfree(values[0]);
st->cur++;
return result;
}
else
{
pfree(st->stat);
pfree(st);
}
return (Datum) 0;
}
static tsstat *
ts_stat_sql(text *txt, text *ws)
{
char *query = TextPGetCString(txt);
int i;
tsstat *newstat,
*stat;
bool isnull;
Portal portal;
void *plan;
if ((plan = SPI_prepare(query, 0, NULL)) == NULL)
/* internal error */
elog(ERROR, "SPI_prepare(\"%s\") failed", query);
if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, false)) == NULL)
/* internal error */
elog(ERROR, "SPI_cursor_open(\"%s\") failed", query);
SPI_cursor_fetch(portal, true, 100);
if (SPI_tuptable->tupdesc->natts != 1 ||
SPI_gettypeid(SPI_tuptable->tupdesc, 1) != TSVECTOROID)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("ts_stat query must return one tsvector column")));
stat = palloc(STATHDRSIZE);
SET_VARSIZE(stat, STATHDRSIZE);
stat->size = 0;
stat->weight = 0;
if (ws)
{
char *buf;
buf = VARDATA(ws);
while (buf - VARDATA(ws) < VARSIZE(ws) - VARHDRSZ)
{
if (pg_mblen(buf) == 1)
{
switch (*buf)
{
case 'A':
case 'a':
stat->weight |= 1 << 3;
break;
case 'B':
case 'b':
stat->weight |= 1 << 2;
break;
case 'C':
case 'c':
stat->weight |= 1 << 1;
break;
case 'D':
case 'd':
stat->weight |= 1;
break;
default:
stat->weight |= 0;
}
}
buf += pg_mblen(buf);
}
}
while (SPI_processed > 0)
{
for (i = 0; i < SPI_processed; i++)
{
Datum data = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull);
if (!isnull)
{
newstat = ts_accum(stat, data);
if (stat != newstat && stat)
pfree(stat);
stat = newstat;
}
}
SPI_freetuptable(SPI_tuptable);
SPI_cursor_fetch(portal, true, 100);
}
SPI_freetuptable(SPI_tuptable);
SPI_cursor_close(portal);
SPI_freeplan(plan);
pfree(query);
return stat;
}
Datum
ts_stat1(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
Datum result;
if (SRF_IS_FIRSTCALL())
{
tsstat *stat;
text *txt = PG_GETARG_TEXT_P(0);
funcctx = SRF_FIRSTCALL_INIT();
SPI_connect();
stat = ts_stat_sql(txt, NULL);
PG_FREE_IF_COPY(txt, 0);
ts_setup_firstcall(fcinfo, funcctx, stat);
SPI_finish();
}
funcctx = SRF_PERCALL_SETUP();
if ((result = ts_process_call(funcctx)) != (Datum) 0)
SRF_RETURN_NEXT(funcctx, result);
SRF_RETURN_DONE(funcctx);
}
Datum
ts_stat2(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
Datum result;
if (SRF_IS_FIRSTCALL())
{
tsstat *stat;
text *txt = PG_GETARG_TEXT_P(0);
text *ws = PG_GETARG_TEXT_P(1);
funcctx = SRF_FIRSTCALL_INIT();
SPI_connect();
stat = ts_stat_sql(txt, ws);
PG_FREE_IF_COPY(txt, 0);
PG_FREE_IF_COPY(ws, 1);
ts_setup_firstcall(fcinfo, funcctx, stat);
SPI_finish();
}
funcctx = SRF_PERCALL_SETUP();
if ((result = ts_process_call(funcctx)) != (Datum) 0)
SRF_RETURN_NEXT(funcctx, result);
SRF_RETURN_DONE(funcctx);
}
/* Check if datatype is TEXT or binary-equivalent to it */
static bool
istexttype(Oid typid)
{
/* varchar(n) and char(n) are binary-compatible with text */
if (typid==TEXTOID || typid==VARCHAROID || typid==BPCHAROID)
return true;
/* Allow domains over these types, too */
typid = getBaseType(typid);
if (typid==TEXTOID || typid==VARCHAROID || typid==BPCHAROID)
return true;
return false;
}
/*
* Triggers for automatic update of a tsvector column from text column(s)
*
* Trigger arguments are either
* name of tsvector col, name of tsconfig to use, name(s) of text col(s)
* name of tsvector col, name of regconfig col, name(s) of text col(s)
* ie, tsconfig can either be specified by name, or indirectly as the
* contents of a regconfig field in the row. If the name is used, it must
* be explicitly schema-qualified.
*/
Datum
tsvector_update_trigger_byid(PG_FUNCTION_ARGS)
{
return tsvector_update_trigger(fcinfo, false);
}
Datum
tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS)
{
return tsvector_update_trigger(fcinfo, true);
}
static Datum
tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column)
{
TriggerData *trigdata;
Trigger *trigger;
Relation rel;
HeapTuple rettuple = NULL;
int tsvector_attr_num,
i;
ParsedText prs;
Datum datum;
bool isnull;
text *txt;
Oid cfgId;
/* Check call context */
if (!CALLED_AS_TRIGGER(fcinfo)) /* internal error */
elog(ERROR, "tsvector_update_trigger: not fired by trigger manager");
trigdata = (TriggerData *) fcinfo->context;
if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
elog(ERROR, "tsvector_update_trigger: can't process STATEMENT events");
if (TRIGGER_FIRED_AFTER(trigdata->tg_event))
elog(ERROR, "tsvector_update_trigger: must be fired BEFORE event");
if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
rettuple = trigdata->tg_trigtuple;
else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
rettuple = trigdata->tg_newtuple;
else
elog(ERROR, "tsvector_update_trigger: must be fired for INSERT or UPDATE");
trigger = trigdata->tg_trigger;
rel = trigdata->tg_relation;
if (trigger->tgnargs < 3)
elog(ERROR, "tsvector_update_trigger: arguments must be tsvector_field, ts_config, text_field1, ...)");
/* Find the target tsvector column */
tsvector_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[0]);
if (tsvector_attr_num == SPI_ERROR_NOATTRIBUTE)
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("tsvector column \"%s\" does not exist",
trigger->tgargs[0])));
if (SPI_gettypeid(rel->rd_att, tsvector_attr_num) != TSVECTOROID)
ereport(ERROR,
(errcode(ERRCODE_DATATYPE_MISMATCH),
errmsg("column \"%s\" is not of tsvector type",
trigger->tgargs[0])));
/* Find the configuration to use */
if (config_column)
{
int config_attr_num;
config_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[1]);
if (config_attr_num == SPI_ERROR_NOATTRIBUTE)
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("config column \"%s\" does not exist",
trigger->tgargs[1])));
if (SPI_gettypeid(rel->rd_att, config_attr_num) != REGCONFIGOID)
ereport(ERROR,
(errcode(ERRCODE_DATATYPE_MISMATCH),
errmsg("column \"%s\" is not of regconfig type",
trigger->tgargs[1])));
datum = SPI_getbinval(rettuple, rel->rd_att, config_attr_num, &isnull);
if (isnull)
ereport(ERROR,
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("config column \"%s\" must not be NULL",
trigger->tgargs[1])));
cfgId = DatumGetObjectId(datum);
}
else
{
List *names;
names = stringToQualifiedNameList(trigger->tgargs[1]);
/* require a schema so that results are not search path dependent */
if (list_length(names) < 2)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("text search configuration name \"%s\" must be schema-qualified",
trigger->tgargs[1])));
cfgId = TSConfigGetCfgid(names, false);
}
/* initialize parse state */
prs.lenwords = 32;
prs.curwords = 0;
prs.pos = 0;
prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
/* find all words in indexable column(s) */
for (i = 2; i < trigger->tgnargs; i++)
{
int numattr;
numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]);
if (numattr == SPI_ERROR_NOATTRIBUTE)
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("column \"%s\" does not exist",
trigger->tgargs[i])));
if (!istexttype(SPI_gettypeid(rel->rd_att, numattr)))
ereport(ERROR,
(errcode(ERRCODE_DATATYPE_MISMATCH),
errmsg("column \"%s\" is not of character type",
trigger->tgargs[i])));
datum = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull);
if (isnull)
continue;
txt = DatumGetTextP(datum);
parsetext(cfgId, &prs, VARDATA(txt), VARSIZE(txt) - VARHDRSZ);
if (txt != (text *) DatumGetPointer(datum))
pfree(txt);
}
/* make tsvector value */
if (prs.curwords)
{
datum = PointerGetDatum(make_tsvector(&prs));
rettuple = SPI_modifytuple(rel, rettuple, 1, &tsvector_attr_num,
&datum, NULL);
pfree(DatumGetPointer(datum));
}
else
{
TSVector out = palloc(CALCDATASIZE(0, 0));
SET_VARSIZE(out, CALCDATASIZE(0, 0));
out->size = 0;
datum = PointerGetDatum(out);
rettuple = SPI_modifytuple(rel, rettuple, 1, &tsvector_attr_num,
&datum, NULL);
pfree(prs.words);
}
if (rettuple == NULL) /* internal error */
elog(ERROR, "tsvector_update_trigger: %d returned by SPI_modifytuple",
SPI_result);
return PointerGetDatum(rettuple);
}