1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-14 18:42:34 +03:00
Files
postgres/src/backend/utils/adt/tsvector_op.c
Tom Lane 5a617d75d3 Fix ts_headline() to handle ORs and phrase queries more honestly.
This patch largely reverts what I did in commits c9b0c678d and
78e73e875.  The maximum cover length limit that I added in 78e73e875
(to band-aid over c9b0c678d's performance issues) creates too many
user-visible behavior discrepancies, as complained of for example in
bug #17691.  The real problem with hlCover() is not what I thought
at the time, but more that it seems to have been designed with only
AND tsquery semantics in mind.  It doesn't work quite right for OR,
and even less so for NOT or phrase queries.  However, we can improve
that situation by building a variant of TS_execute() that returns a
list of match locations.  We already get an ExecPhraseData struct
representing match locations for the primitive case of a simple match,
as well as one for a phrase match; we just need to add some logic to
combine these for AND and OR operators.  The result is a list of
ExecPhraseDatas, which hlCover can regard as having simple AND
semantics, so that its old algorithm works correctly.

There's still a lot not to like about ts_headline's behavior, but
I think the remaining issues have to do with the heuristics used
in mark_hl_words and mark_hl_fragments (which, likewise, were not
revisited when phrase search was added).  Improving those is a task
for another day.

Patch by me; thanks to Alvaro Herrera for review.

Discussion: https://postgr.es/m/840.1669405935@sss.pgh.pa.us
2023-01-19 16:21:44 -05:00

2894 lines
69 KiB
C

/*-------------------------------------------------------------------------
*
* tsvector_op.c
* operations over tsvector
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/utils/adt/tsvector_op.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <limits.h>
#include "access/htup_details.h"
#include "catalog/namespace.h"
#include "catalog/pg_type.h"
#include "commands/trigger.h"
#include "executor/spi.h"
#include "funcapi.h"
#include "lib/qunique.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "parser/parse_coerce.h"
#include "tsearch/ts_utils.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/regproc.h"
#include "utils/rel.h"
typedef struct
{
WordEntry *arrb;
WordEntry *arre;
char *values;
char *operand;
} CHKVAL;
typedef struct StatEntry
{
uint32 ndoc; /* zero indicates that we were already here
* while walking through the tree */
uint32 nentry;
struct StatEntry *left;
struct StatEntry *right;
uint32 lenlexeme;
char lexeme[FLEXIBLE_ARRAY_MEMBER];
} StatEntry;
#define STATENTRYHDRSZ (offsetof(StatEntry, lexeme))
typedef struct
{
int32 weight;
uint32 maxdepth;
StatEntry **stack;
uint32 stackpos;
StatEntry *root;
} TSVectorStat;
static TSTernaryValue TS_execute_recurse(QueryItem *curitem, void *arg,
uint32 flags,
TSExecuteCallback chkcond);
static bool TS_execute_locations_recurse(QueryItem *curitem,
void *arg,
TSExecuteCallback chkcond,
List **locations);
static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len);
static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
/*
* Order: haspos, len, word, for all positions (pos, weight)
*/
static int
silly_cmp_tsvector(const TSVector a, const TSVector b)
{
if (VARSIZE(a) < VARSIZE(b))
return -1;
else if (VARSIZE(a) > VARSIZE(b))
return 1;
else if (a->size < b->size)
return -1;
else if (a->size > b->size)
return 1;
else
{
WordEntry *aptr = ARRPTR(a);
WordEntry *bptr = ARRPTR(b);
int i = 0;
int res;
for (i = 0; i < a->size; i++)
{
if (aptr->haspos != bptr->haspos)
{
return (aptr->haspos > bptr->haspos) ? -1 : 1;
}
else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0)
{
return res;
}
else if (aptr->haspos)
{
WordEntryPos *ap = POSDATAPTR(a, aptr);
WordEntryPos *bp = POSDATAPTR(b, bptr);
int j;
if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr))
return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1;
for (j = 0; j < POSDATALEN(a, aptr); j++)
{
if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp))
{
return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1;
}
else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp))
{
return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1;
}
ap++, bp++;
}
}
aptr++;
bptr++;
}
}
return 0;
}
#define TSVECTORCMPFUNC( type, action, ret ) \
Datum \
tsvector_##type(PG_FUNCTION_ARGS) \
{ \
TSVector a = PG_GETARG_TSVECTOR(0); \
TSVector b = PG_GETARG_TSVECTOR(1); \
int res = silly_cmp_tsvector(a, b); \
PG_FREE_IF_COPY(a,0); \
PG_FREE_IF_COPY(b,1); \
PG_RETURN_##ret( res action 0 ); \
} \
/* keep compiler quiet - no extra ; */ \
extern int no_such_variable
TSVECTORCMPFUNC(lt, <, BOOL);
TSVECTORCMPFUNC(le, <=, BOOL);
TSVECTORCMPFUNC(eq, ==, BOOL);
TSVECTORCMPFUNC(ge, >=, BOOL);
TSVECTORCMPFUNC(gt, >, BOOL);
TSVECTORCMPFUNC(ne, !=, BOOL);
TSVECTORCMPFUNC(cmp, +, INT32);
Datum
tsvector_strip(PG_FUNCTION_ARGS)
{
TSVector in = PG_GETARG_TSVECTOR(0);
TSVector out;
int i,
len = 0;
WordEntry *arrin = ARRPTR(in),
*arrout;
char *cur;
for (i = 0; i < in->size; i++)
len += arrin[i].len;
len = CALCDATASIZE(in->size, len);
out = (TSVector) palloc0(len);
SET_VARSIZE(out, len);
out->size = in->size;
arrout = ARRPTR(out);
cur = STRPTR(out);
for (i = 0; i < in->size; i++)
{
memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len);
arrout[i].haspos = 0;
arrout[i].len = arrin[i].len;
arrout[i].pos = cur - STRPTR(out);
cur += arrout[i].len;
}
PG_FREE_IF_COPY(in, 0);
PG_RETURN_POINTER(out);
}
Datum
tsvector_length(PG_FUNCTION_ARGS)
{
TSVector in = PG_GETARG_TSVECTOR(0);
int32 ret = in->size;
PG_FREE_IF_COPY(in, 0);
PG_RETURN_INT32(ret);
}
Datum
tsvector_setweight(PG_FUNCTION_ARGS)
{
TSVector in = PG_GETARG_TSVECTOR(0);
char cw = PG_GETARG_CHAR(1);
TSVector out;
int i,
j;
WordEntry *entry;
WordEntryPos *p;
int w = 0;
switch (cw)
{
case 'A':
case 'a':
w = 3;
break;
case 'B':
case 'b':
w = 2;
break;
case 'C':
case 'c':
w = 1;
break;
case 'D':
case 'd':
w = 0;
break;
default:
/* internal error */
elog(ERROR, "unrecognized weight: %d", cw);
}
out = (TSVector) palloc(VARSIZE(in));
memcpy(out, in, VARSIZE(in));
entry = ARRPTR(out);
i = out->size;
while (i--)
{
if ((j = POSDATALEN(out, entry)) != 0)
{
p = POSDATAPTR(out, entry);
while (j--)
{
WEP_SETWEIGHT(*p, w);
p++;
}
}
entry++;
}
PG_FREE_IF_COPY(in, 0);
PG_RETURN_POINTER(out);
}
/*
* setweight(tsin tsvector, char_weight "char", lexemes "text"[])
*
* Assign weight w to elements of tsin that are listed in lexemes.
*/
Datum
tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
{
TSVector tsin = PG_GETARG_TSVECTOR(0);
char char_weight = PG_GETARG_CHAR(1);
ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2);
TSVector tsout;
int i,
j,
nlexemes,
weight;
WordEntry *entry;
Datum *dlexemes;
bool *nulls;
switch (char_weight)
{
case 'A':
case 'a':
weight = 3;
break;
case 'B':
case 'b':
weight = 2;
break;
case 'C':
case 'c':
weight = 1;
break;
case 'D':
case 'd':
weight = 0;
break;
default:
/* internal error */
elog(ERROR, "unrecognized weight: %c", char_weight);
}
tsout = (TSVector) palloc(VARSIZE(tsin));
memcpy(tsout, tsin, VARSIZE(tsin));
entry = ARRPTR(tsout);
deconstruct_array_builtin(lexemes, TEXTOID, &dlexemes, &nulls, &nlexemes);
/*
* Assuming that lexemes array is significantly shorter than tsvector we
* can iterate through lexemes performing binary search of each lexeme
* from lexemes in tsvector.
*/
for (i = 0; i < nlexemes; i++)
{
char *lex;
int lex_len,
lex_pos;
/* Ignore null array elements, they surely don't match */
if (nulls[i])
continue;
lex = VARDATA(dlexemes[i]);
lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
lex_pos = tsvector_bsearch(tsout, lex, lex_len);
if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
{
WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
while (j--)
{
WEP_SETWEIGHT(*p, weight);
p++;
}
}
}
PG_FREE_IF_COPY(tsin, 0);
PG_FREE_IF_COPY(lexemes, 2);
PG_RETURN_POINTER(tsout);
}
#define compareEntry(pa, a, pb, b) \
tsCompareString((pa) + (a)->pos, (a)->len, \
(pb) + (b)->pos, (b)->len, \
false)
/*
* Add positions from src to dest after offsetting them by maxpos.
* Return the number added (might be less than expected due to overflow)
*/
static int32
add_pos(TSVector src, WordEntry *srcptr,
TSVector dest, WordEntry *destptr,
int32 maxpos)
{
uint16 *clen = &_POSVECPTR(dest, destptr)->npos;
int i;
uint16 slen = POSDATALEN(src, srcptr),
startlen;
WordEntryPos *spos = POSDATAPTR(src, srcptr),
*dpos = POSDATAPTR(dest, destptr);
if (!destptr->haspos)
*clen = 0;
startlen = *clen;
for (i = 0;
i < slen && *clen < MAXNUMPOS &&
(*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1);
i++)
{
WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i]));
WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
(*clen)++;
}
if (*clen != startlen)
destptr->haspos = 1;
return *clen - startlen;
}
/*
* Perform binary search of given lexeme in TSVector.
* Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
* found.
*/
static int
tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
{
WordEntry *arrin = ARRPTR(tsv);
int StopLow = 0,
StopHigh = tsv->size,
StopMiddle,
cmp;
while (StopLow < StopHigh)
{
StopMiddle = (StopLow + StopHigh) / 2;
cmp = tsCompareString(lexeme, lexeme_len,
STRPTR(tsv) + arrin[StopMiddle].pos,
arrin[StopMiddle].len,
false);
if (cmp < 0)
StopHigh = StopMiddle;
else if (cmp > 0)
StopLow = StopMiddle + 1;
else /* found it */
return StopMiddle;
}
return -1;
}
/*
* qsort comparator functions
*/
static int
compare_int(const void *va, const void *vb)
{
int a = *((const int *) va);
int b = *((const int *) vb);
if (a == b)
return 0;
return (a > b) ? 1 : -1;
}
static int
compare_text_lexemes(const void *va, const void *vb)
{
Datum a = *((const Datum *) va);
Datum b = *((const Datum *) vb);
char *alex = VARDATA_ANY(a);
int alex_len = VARSIZE_ANY_EXHDR(a);
char *blex = VARDATA_ANY(b);
int blex_len = VARSIZE_ANY_EXHDR(b);
return tsCompareString(alex, alex_len, blex, blex_len, false);
}
/*
* Internal routine to delete lexemes from TSVector by array of offsets.
*
* int *indices_to_delete -- array of lexeme offsets to delete (modified here!)
* int indices_count -- size of that array
*
* Returns new TSVector without given lexemes along with their positions
* and weights.
*/
static TSVector
tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
int indices_count)
{
TSVector tsout;
WordEntry *arrin = ARRPTR(tsv),
*arrout;
char *data = STRPTR(tsv),
*dataout;
int i, /* index in arrin */
j, /* index in arrout */
k, /* index in indices_to_delete */
curoff; /* index in dataout area */
/*
* Sort the filter array to simplify membership checks below. Also, get
* rid of any duplicate entries, so that we can assume that indices_count
* is exactly equal to the number of lexemes that will be removed.
*/
if (indices_count > 1)
{
qsort(indices_to_delete, indices_count, sizeof(int), compare_int);
indices_count = qunique(indices_to_delete, indices_count, sizeof(int),
compare_int);
}
/*
* Here we overestimate tsout size, since we don't know how much space is
* used by the deleted lexeme(s). We will set exact size below.
*/
tsout = (TSVector) palloc0(VARSIZE(tsv));
/* This count must be correct because STRPTR(tsout) relies on it. */
tsout->size = tsv->size - indices_count;
/*
* Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
*/
arrout = ARRPTR(tsout);
dataout = STRPTR(tsout);
curoff = 0;
for (i = j = k = 0; i < tsv->size; i++)
{
/*
* If current i is present in indices_to_delete, skip this lexeme.
* Since indices_to_delete is already sorted, we only need to check
* the current (k'th) entry.
*/
if (k < indices_count && i == indices_to_delete[k])
{
k++;
continue;
}
/* Copy lexeme and its positions and weights */
memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
arrout[j].haspos = arrin[i].haspos;
arrout[j].len = arrin[i].len;
arrout[j].pos = curoff;
curoff += arrin[i].len;
if (arrin[i].haspos)
{
int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
+ sizeof(uint16);
curoff = SHORTALIGN(curoff);
memcpy(dataout + curoff,
STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
len);
curoff += len;
}
j++;
}
/*
* k should now be exactly equal to indices_count. If it isn't then the
* caller provided us with indices outside of [0, tsv->size) range and
* estimation of tsout's size is wrong.
*/
Assert(k == indices_count);
SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
return tsout;
}
/*
* Delete given lexeme from tsvector.
* Implementation of user-level ts_delete(tsvector, text).
*/
Datum
tsvector_delete_str(PG_FUNCTION_ARGS)
{
TSVector tsin = PG_GETARG_TSVECTOR(0),
tsout;
text *tlexeme = PG_GETARG_TEXT_PP(1);
char *lexeme = VARDATA_ANY(tlexeme);
int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
skip_index;
if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
PG_RETURN_POINTER(tsin);
tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
PG_FREE_IF_COPY(tsin, 0);
PG_FREE_IF_COPY(tlexeme, 1);
PG_RETURN_POINTER(tsout);
}
/*
* Delete given array of lexemes from tsvector.
* Implementation of user-level ts_delete(tsvector, text[]).
*/
Datum
tsvector_delete_arr(PG_FUNCTION_ARGS)
{
TSVector tsin = PG_GETARG_TSVECTOR(0),
tsout;
ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
int i,
nlex,
skip_count,
*skip_indices;
Datum *dlexemes;
bool *nulls;
deconstruct_array_builtin(lexemes, TEXTOID, &dlexemes, &nulls, &nlex);
/*
* In typical use case array of lexemes to delete is relatively small. So
* here we optimize things for that scenario: iterate through lexarr
* performing binary search of each lexeme from lexarr in tsvector.
*/
skip_indices = palloc0(nlex * sizeof(int));
for (i = skip_count = 0; i < nlex; i++)
{
char *lex;
int lex_len,
lex_pos;
/* Ignore null array elements, they surely don't match */
if (nulls[i])
continue;
lex = VARDATA(dlexemes[i]);
lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
lex_pos = tsvector_bsearch(tsin, lex, lex_len);
if (lex_pos >= 0)
skip_indices[skip_count++] = lex_pos;
}
tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
pfree(skip_indices);
PG_FREE_IF_COPY(tsin, 0);
PG_FREE_IF_COPY(lexemes, 1);
PG_RETURN_POINTER(tsout);
}
/*
* Expand tsvector as table with following columns:
* lexeme: lexeme text
* positions: integer array of lexeme positions
* weights: char array of weights corresponding to positions
*/
Datum
tsvector_unnest(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
TSVector tsin;
if (SRF_IS_FIRSTCALL())
{
MemoryContext oldcontext;
TupleDesc tupdesc;
funcctx = SRF_FIRSTCALL_INIT();
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
tupdesc = CreateTemplateTupleDesc(3);
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
TEXTOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
INT2ARRAYOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
TEXTARRAYOID, -1, 0);
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
funcctx->tuple_desc = tupdesc;
funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
MemoryContextSwitchTo(oldcontext);
}
funcctx = SRF_PERCALL_SETUP();
tsin = (TSVector) funcctx->user_fctx;
if (funcctx->call_cntr < tsin->size)
{
WordEntry *arrin = ARRPTR(tsin);
char *data = STRPTR(tsin);
HeapTuple tuple;
int j,
i = funcctx->call_cntr;
bool nulls[] = {false, false, false};
Datum values[3];
values[0] = PointerGetDatum(cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len));
if (arrin[i].haspos)
{
WordEntryPosVector *posv;
Datum *positions;
Datum *weights;
char weight;
/*
* Internally tsvector stores position and weight in the same
* uint16 (2 bits for weight, 14 for position). Here we extract
* that in two separate arrays.
*/
posv = _POSVECPTR(tsin, arrin + i);
positions = palloc(posv->npos * sizeof(Datum));
weights = palloc(posv->npos * sizeof(Datum));
for (j = 0; j < posv->npos; j++)
{
positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
weights[j] = PointerGetDatum(cstring_to_text_with_len(&weight,
1));
}
values[1] = PointerGetDatum(construct_array_builtin(positions, posv->npos, INT2OID));
values[2] = PointerGetDatum(construct_array_builtin(weights, posv->npos, TEXTOID));
}
else
{
nulls[1] = nulls[2] = true;
}
tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
}
else
{
SRF_RETURN_DONE(funcctx);
}
}
/*
* Convert tsvector to array of lexemes.
*/
Datum
tsvector_to_array(PG_FUNCTION_ARGS)
{
TSVector tsin = PG_GETARG_TSVECTOR(0);
WordEntry *arrin = ARRPTR(tsin);
Datum *elements;
int i;
ArrayType *array;
elements = palloc(tsin->size * sizeof(Datum));
for (i = 0; i < tsin->size; i++)
{
elements[i] = PointerGetDatum(cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos,
arrin[i].len));
}
array = construct_array_builtin(elements, tsin->size, TEXTOID);
pfree(elements);
PG_FREE_IF_COPY(tsin, 0);
PG_RETURN_POINTER(array);
}
/*
* Build tsvector from array of lexemes.
*/
Datum
array_to_tsvector(PG_FUNCTION_ARGS)
{
ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
TSVector tsout;
Datum *dlexemes;
WordEntry *arrout;
bool *nulls;
int nitems,
i,
tslen,
datalen = 0;
char *cur;
deconstruct_array_builtin(v, TEXTOID, &dlexemes, &nulls, &nitems);
/*
* Reject nulls and zero length strings (maybe we should just ignore them,
* instead?)
*/
for (i = 0; i < nitems; i++)
{
if (nulls[i])
ereport(ERROR,
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("lexeme array may not contain nulls")));
if (VARSIZE(dlexemes[i]) - VARHDRSZ == 0)
ereport(ERROR,
(errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
errmsg("lexeme array may not contain empty strings")));
}
/* Sort and de-dup, because this is required for a valid tsvector. */
if (nitems > 1)
{
qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
nitems = qunique(dlexemes, nitems, sizeof(Datum),
compare_text_lexemes);
}
/* Calculate space needed for surviving lexemes. */
for (i = 0; i < nitems; i++)
datalen += VARSIZE(dlexemes[i]) - VARHDRSZ;
tslen = CALCDATASIZE(nitems, datalen);
/* Allocate and fill tsvector. */
tsout = (TSVector) palloc0(tslen);
SET_VARSIZE(tsout, tslen);
tsout->size = nitems;
arrout = ARRPTR(tsout);
cur = STRPTR(tsout);
for (i = 0; i < nitems; i++)
{
char *lex = VARDATA(dlexemes[i]);
int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
memcpy(cur, lex, lex_len);
arrout[i].haspos = 0;
arrout[i].len = lex_len;
arrout[i].pos = cur - STRPTR(tsout);
cur += lex_len;
}
PG_FREE_IF_COPY(v, 0);
PG_RETURN_POINTER(tsout);
}
/*
* ts_filter(): keep only lexemes with given weights in tsvector.
*/
Datum
tsvector_filter(PG_FUNCTION_ARGS)
{
TSVector tsin = PG_GETARG_TSVECTOR(0),
tsout;
ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
WordEntry *arrin = ARRPTR(tsin),
*arrout;
char *datain = STRPTR(tsin),
*dataout;
Datum *dweights;
bool *nulls;
int nweights;
int i,
j;
int cur_pos = 0;
char mask = 0;
deconstruct_array_builtin(weights, CHAROID, &dweights, &nulls, &nweights);
for (i = 0; i < nweights; i++)
{
char char_weight;
if (nulls[i])
ereport(ERROR,
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("weight array may not contain nulls")));
char_weight = DatumGetChar(dweights[i]);
switch (char_weight)
{
case 'A':
case 'a':
mask = mask | 8;
break;
case 'B':
case 'b':
mask = mask | 4;
break;
case 'C':
case 'c':
mask = mask | 2;
break;
case 'D':
case 'd':
mask = mask | 1;
break;
default:
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized weight: \"%c\"", char_weight)));
}
}
tsout = (TSVector) palloc0(VARSIZE(tsin));
tsout->size = tsin->size;
arrout = ARRPTR(tsout);
dataout = STRPTR(tsout);
for (i = j = 0; i < tsin->size; i++)
{
WordEntryPosVector *posvin,
*posvout;
int npos = 0;
int k;
if (!arrin[i].haspos)
continue;
posvin = _POSVECPTR(tsin, arrin + i);
posvout = (WordEntryPosVector *)
(dataout + SHORTALIGN(cur_pos + arrin[i].len));
for (k = 0; k < posvin->npos; k++)
{
if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
posvout->pos[npos++] = posvin->pos[k];
}
/* if no satisfactory positions found, skip lexeme */
if (!npos)
continue;
arrout[j].haspos = true;
arrout[j].len = arrin[i].len;
arrout[j].pos = cur_pos;
memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
posvout->npos = npos;
cur_pos += SHORTALIGN(arrin[i].len);
cur_pos += POSDATALEN(tsout, arrout + j) * sizeof(WordEntryPos) +
sizeof(uint16);
j++;
}
tsout->size = j;
if (dataout != STRPTR(tsout))
memmove(STRPTR(tsout), dataout, cur_pos);
SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
PG_FREE_IF_COPY(tsin, 0);
PG_RETURN_POINTER(tsout);
}
Datum
tsvector_concat(PG_FUNCTION_ARGS)
{
TSVector in1 = PG_GETARG_TSVECTOR(0);
TSVector in2 = PG_GETARG_TSVECTOR(1);
TSVector out;
WordEntry *ptr;
WordEntry *ptr1,
*ptr2;
WordEntryPos *p;
int maxpos = 0,
i,
j,
i1,
i2,
dataoff,
output_bytes,
output_size;
char *data,
*data1,
*data2;
/* Get max position in in1; we'll need this to offset in2's positions */
ptr = ARRPTR(in1);
i = in1->size;
while (i--)
{
if ((j = POSDATALEN(in1, ptr)) != 0)
{
p = POSDATAPTR(in1, ptr);
while (j--)
{
if (WEP_GETPOS(*p) > maxpos)
maxpos = WEP_GETPOS(*p);
p++;
}
}
ptr++;
}
ptr1 = ARRPTR(in1);
ptr2 = ARRPTR(in2);
data1 = STRPTR(in1);
data2 = STRPTR(in2);
i1 = in1->size;
i2 = in2->size;
/*
* Conservative estimate of space needed. We might need all the data in
* both inputs, and conceivably add a pad byte before position data for
* each item where there was none before.
*/
output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2;
out = (TSVector) palloc0(output_bytes);
SET_VARSIZE(out, output_bytes);
/*
* We must make out->size valid so that STRPTR(out) is sensible. We'll
* collapse out any unused space at the end.
*/
out->size = in1->size + in2->size;
ptr = ARRPTR(out);
data = STRPTR(out);
dataoff = 0;
while (i1 && i2)
{
int cmp = compareEntry(data1, ptr1, data2, ptr2);
if (cmp < 0)
{ /* in1 first */
ptr->haspos = ptr1->haspos;
ptr->len = ptr1->len;
memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
ptr->pos = dataoff;
dataoff += ptr1->len;
if (ptr->haspos)
{
dataoff = SHORTALIGN(dataoff);
memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
}
ptr++;
ptr1++;
i1--;
}
else if (cmp > 0)
{ /* in2 first */
ptr->haspos = ptr2->haspos;
ptr->len = ptr2->len;
memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
ptr->pos = dataoff;
dataoff += ptr2->len;
if (ptr->haspos)
{
int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
if (addlen == 0)
ptr->haspos = 0;
else
{
dataoff = SHORTALIGN(dataoff);
dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
}
}
ptr++;
ptr2++;
i2--;
}
else
{
ptr->haspos = ptr1->haspos | ptr2->haspos;
ptr->len = ptr1->len;
memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
ptr->pos = dataoff;
dataoff += ptr1->len;
if (ptr->haspos)
{
if (ptr1->haspos)
{
dataoff = SHORTALIGN(dataoff);
memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
if (ptr2->haspos)
dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
}
else /* must have ptr2->haspos */
{
int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
if (addlen == 0)
ptr->haspos = 0;
else
{
dataoff = SHORTALIGN(dataoff);
dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
}
}
}
ptr++;
ptr1++;
ptr2++;
i1--;
i2--;
}
}
while (i1)
{
ptr->haspos = ptr1->haspos;
ptr->len = ptr1->len;
memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
ptr->pos = dataoff;
dataoff += ptr1->len;
if (ptr->haspos)
{
dataoff = SHORTALIGN(dataoff);
memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
}
ptr++;
ptr1++;
i1--;
}
while (i2)
{
ptr->haspos = ptr2->haspos;
ptr->len = ptr2->len;
memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
ptr->pos = dataoff;
dataoff += ptr2->len;
if (ptr->haspos)
{
int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
if (addlen == 0)
ptr->haspos = 0;
else
{
dataoff = SHORTALIGN(dataoff);
dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
}
}
ptr++;
ptr2++;
i2--;
}
/*
* Instead of checking each offset individually, we check for overflow of
* pos fields once at the end.
*/
if (dataoff > MAXSTRPOS)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("string is too long for tsvector (%d bytes, max %d bytes)", dataoff, MAXSTRPOS)));
/*
* Adjust sizes (asserting that we didn't overrun the original estimates)
* and collapse out any unused array entries.
*/
output_size = ptr - ARRPTR(out);
Assert(output_size <= out->size);
out->size = output_size;
if (data != STRPTR(out))
memmove(STRPTR(out), data, dataoff);
output_bytes = CALCDATASIZE(out->size, dataoff);
Assert(output_bytes <= VARSIZE(out));
SET_VARSIZE(out, output_bytes);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_POINTER(out);
}
/*
* Compare two strings by tsvector rules.
*
* if prefix = true then it returns zero value iff b has prefix a
*/
int32
tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
{
int cmp;
if (lena == 0)
{
if (prefix)
cmp = 0; /* empty string is prefix of anything */
else
cmp = (lenb > 0) ? -1 : 0;
}
else if (lenb == 0)
{
cmp = (lena > 0) ? 1 : 0;
}
else
{
cmp = memcmp(a, b, Min((unsigned int) lena, (unsigned int) lenb));
if (prefix)
{
if (cmp == 0 && lena > lenb)
cmp = 1; /* a is longer, so not a prefix of b */
}
else if (cmp == 0 && lena != lenb)
{
cmp = (lena < lenb) ? -1 : 1;
}
}
return cmp;
}
/*
* Check weight info or/and fill 'data' with the required positions
*/
static TSTernaryValue
checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
ExecPhraseData *data)
{
TSTernaryValue result = TS_NO;
Assert(data == NULL || data->npos == 0);
if (entry->haspos)
{
WordEntryPosVector *posvec;
/*
* We can't use the _POSVECPTR macro here because the pointer to the
* tsvector's lexeme storage is already contained in chkval->values.
*/
posvec = (WordEntryPosVector *)
(chkval->values + SHORTALIGN(entry->pos + entry->len));
if (val->weight && data)
{
WordEntryPos *posvec_iter = posvec->pos;
WordEntryPos *dptr;
/*
* Filter position information by weights
*/
dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos);
data->allocated = true;
/* Is there a position with a matching weight? */
while (posvec_iter < posvec->pos + posvec->npos)
{
/* If true, append this position to the data->pos */
if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
{
*dptr = WEP_GETPOS(*posvec_iter);
dptr++;
}
posvec_iter++;
}
data->npos = dptr - data->pos;
if (data->npos > 0)
result = TS_YES;
else
{
pfree(data->pos);
data->pos = NULL;
data->allocated = false;
}
}
else if (val->weight)
{
WordEntryPos *posvec_iter = posvec->pos;
/* Is there a position with a matching weight? */
while (posvec_iter < posvec->pos + posvec->npos)
{
if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
{
result = TS_YES;
break; /* no need to go further */
}
posvec_iter++;
}
}
else if (data)
{
data->npos = posvec->npos;
data->pos = posvec->pos;
data->allocated = false;
result = TS_YES;
}
else
{
/* simplest case: no weight check, positions not needed */
result = TS_YES;
}
}
else
{
/*
* Position info is lacking, so if the caller requires it, we can only
* say that maybe there is a match.
*
* Notice, however, that we *don't* check val->weight here.
* Historically, stripped tsvectors are considered to match queries
* whether or not the query has a weight restriction; that's a little
* dubious but we'll preserve the behavior.
*/
if (data)
result = TS_MAYBE;
else
result = TS_YES;
}
return result;
}
/*
* TS_execute callback for matching a tsquery operand to plain tsvector data
*/
static TSTernaryValue
checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
{
CHKVAL *chkval = (CHKVAL *) checkval;
WordEntry *StopLow = chkval->arrb;
WordEntry *StopHigh = chkval->arre;
WordEntry *StopMiddle = StopHigh;
TSTernaryValue res = TS_NO;
/* Loop invariant: StopLow <= val < StopHigh */
while (StopLow < StopHigh)
{
int difference;
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
difference = tsCompareString(chkval->operand + val->distance,
val->length,
chkval->values + StopMiddle->pos,
StopMiddle->len,
false);
if (difference == 0)
{
/* Check weight info & fill 'data' with positions */
res = checkclass_str(chkval, StopMiddle, val, data);
break;
}
else if (difference > 0)
StopLow = StopMiddle + 1;
else
StopHigh = StopMiddle;
}
/*
* If it's a prefix search, we should also consider lexemes that the
* search term is a prefix of (which will necessarily immediately follow
* the place we found in the above loop). But we can skip them if there
* was a definite match on the exact term AND the caller doesn't need
* position info.
*/
if (val->prefix && (res != TS_YES || data))
{
WordEntryPos *allpos = NULL;
int npos = 0,
totalpos = 0;
/* adjust start position for corner case */
if (StopLow >= StopHigh)
StopMiddle = StopHigh;
/* we don't try to re-use any data from the initial match */
if (data)
{
if (data->allocated)
pfree(data->pos);
data->pos = NULL;
data->allocated = false;
data->npos = 0;
}
res = TS_NO;
while ((res != TS_YES || data) &&
StopMiddle < chkval->arre &&
tsCompareString(chkval->operand + val->distance,
val->length,
chkval->values + StopMiddle->pos,
StopMiddle->len,
true) == 0)
{
TSTernaryValue subres;
subres = checkclass_str(chkval, StopMiddle, val, data);
if (subres != TS_NO)
{
if (data)
{
/*
* We need to join position information
*/
if (subres == TS_MAYBE)
{
/*
* No position info for this match, so we must report
* MAYBE overall.
*/
res = TS_MAYBE;
/* forget any previous positions */
npos = 0;
/* don't leak storage */
if (allpos)
pfree(allpos);
break;
}
while (npos + data->npos > totalpos)
{
if (totalpos == 0)
{
totalpos = 256;
allpos = palloc(sizeof(WordEntryPos) * totalpos);
}
else
{
totalpos *= 2;
allpos = repalloc(allpos, sizeof(WordEntryPos) * totalpos);
}
}
memcpy(allpos + npos, data->pos, sizeof(WordEntryPos) * data->npos);
npos += data->npos;
/* don't leak storage from individual matches */
if (data->allocated)
pfree(data->pos);
data->pos = NULL;
data->allocated = false;
/* it's important to reset data->npos before next loop */
data->npos = 0;
}
else
{
/* Don't need positions, just handle YES/MAYBE */
if (subres == TS_YES || res == TS_NO)
res = subres;
}
}
StopMiddle++;
}
if (data && npos > 0)
{
/* Sort and make unique array of found positions */
data->pos = allpos;
qsort(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
data->npos = qunique(data->pos, npos, sizeof(WordEntryPos),
compareWordEntryPos);
data->allocated = true;
res = TS_YES;
}
}
return res;
}
/*
* Compute output position list for a tsquery operator in phrase mode.
*
* Merge the position lists in Ldata and Rdata as specified by "emit",
* returning the result list into *data. The input position lists must be
* sorted and unique, and the output will be as well.
*
* data: pointer to initially-all-zeroes output struct, or NULL
* Ldata, Rdata: input position lists
* emit: bitmask of TSPO_XXX flags
* Loffset: offset to be added to Ldata positions before comparing/outputting
* Roffset: offset to be added to Rdata positions before comparing/outputting
* max_npos: maximum possible required size of output position array
*
* Loffset and Roffset should not be negative, else we risk trying to output
* negative positions, which won't fit into WordEntryPos.
*
* The result is boolean (TS_YES or TS_NO), but for the caller's convenience
* we return it as TSTernaryValue.
*
* Returns TS_YES if any positions were emitted to *data; or if data is NULL,
* returns TS_YES if any positions would have been emitted.
*/
#define TSPO_L_ONLY 0x01 /* emit positions appearing only in L */
#define TSPO_R_ONLY 0x02 /* emit positions appearing only in R */
#define TSPO_BOTH 0x04 /* emit positions appearing in both L&R */
static TSTernaryValue
TS_phrase_output(ExecPhraseData *data,
ExecPhraseData *Ldata,
ExecPhraseData *Rdata,
int emit,
int Loffset,
int Roffset,
int max_npos)
{
int Lindex,
Rindex;
/* Loop until both inputs are exhausted */
Lindex = Rindex = 0;
while (Lindex < Ldata->npos || Rindex < Rdata->npos)
{
int Lpos,
Rpos;
int output_pos = 0;
/*
* Fetch current values to compare. WEP_GETPOS() is needed because
* ExecPhraseData->data can point to a tsvector's WordEntryPosVector.
*/
if (Lindex < Ldata->npos)
Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset;
else
{
/* L array exhausted, so we're done if R_ONLY isn't set */
if (!(emit & TSPO_R_ONLY))
break;
Lpos = INT_MAX;
}
if (Rindex < Rdata->npos)
Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset;
else
{
/* R array exhausted, so we're done if L_ONLY isn't set */
if (!(emit & TSPO_L_ONLY))
break;
Rpos = INT_MAX;
}
/* Merge-join the two input lists */
if (Lpos < Rpos)
{
/* Lpos is not matched in Rdata, should we output it? */
if (emit & TSPO_L_ONLY)
output_pos = Lpos;
Lindex++;
}
else if (Lpos == Rpos)
{
/* Lpos and Rpos match ... should we output it? */
if (emit & TSPO_BOTH)
output_pos = Rpos;
Lindex++;
Rindex++;
}
else /* Lpos > Rpos */
{
/* Rpos is not matched in Ldata, should we output it? */
if (emit & TSPO_R_ONLY)
output_pos = Rpos;
Rindex++;
}
if (output_pos > 0)
{
if (data)
{
/* Store position, first allocating output array if needed */
if (data->pos == NULL)
{
data->pos = (WordEntryPos *)
palloc(max_npos * sizeof(WordEntryPos));
data->allocated = true;
}
data->pos[data->npos++] = output_pos;
}
else
{
/*
* Exact positions not needed, so return TS_YES as soon as we
* know there is at least one.
*/
return TS_YES;
}
}
}
if (data && data->npos > 0)
{
/* Let's assert we didn't overrun the array */
Assert(data->npos <= max_npos);
return TS_YES;
}
return TS_NO;
}
/*
* Execute tsquery at or below an OP_PHRASE operator.
*
* This handles tsquery execution at recursion levels where we need to care
* about match locations.
*
* In addition to the same arguments used for TS_execute, the caller may pass
* a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme
* match position info on success. data == NULL if no position data need be
* returned.
* Note: the function assumes data != NULL for operators other than OP_PHRASE.
* This is OK because an outside call always starts from an OP_PHRASE node,
* and all internal recursion cases pass data != NULL.
*
* The detailed semantics of the match data, given that the function returned
* TS_YES (successful match), are:
*
* npos > 0, negate = false:
* query is matched at specified position(s) (and only those positions)
* npos > 0, negate = true:
* query is matched at all positions *except* specified position(s)
* npos = 0, negate = true:
* query is matched at all positions
* npos = 0, negate = false:
* disallowed (this should result in TS_NO or TS_MAYBE, as appropriate)
*
* Successful matches also return a "width" value which is the match width in
* lexemes, less one. Hence, "width" is zero for simple one-lexeme matches,
* and is the sum of the phrase operator distances for phrase matches. Note
* that when width > 0, the listed positions represent the ends of matches not
* the starts. (This unintuitive rule is needed to avoid possibly generating
* negative positions, which wouldn't fit into the WordEntryPos arrays.)
*
* If the TSExecuteCallback function reports that an operand is present
* but fails to provide position(s) for it, we will return TS_MAYBE when
* it is possible but not certain that the query is matched.
*
* When the function returns TS_NO or TS_MAYBE, it must return npos = 0,
* negate = false (which is the state initialized by the caller); but the
* "width" output in such cases is undefined.
*/
static TSTernaryValue
TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags,
TSExecuteCallback chkcond,
ExecPhraseData *data)
{
ExecPhraseData Ldata,
Rdata;
TSTernaryValue lmatch,
rmatch;
int Loffset,
Roffset,
maxwidth;
/* since this function recurses, it could be driven to stack overflow */
check_stack_depth();
/* ... and let's check for query cancel while we're at it */
CHECK_FOR_INTERRUPTS();
if (curitem->type == QI_VAL)
return chkcond(arg, (QueryOperand *) curitem, data);
switch (curitem->qoperator.oper)
{
case OP_NOT:
/*
* We need not touch data->width, since a NOT operation does not
* change the match width.
*/
if (flags & TS_EXEC_SKIP_NOT)
{
/* with SKIP_NOT, report NOT as "match everywhere" */
Assert(data->npos == 0 && !data->negate);
data->negate = true;
return TS_YES;
}
switch (TS_phrase_execute(curitem + 1, arg, flags, chkcond, data))
{
case TS_NO:
/* change "match nowhere" to "match everywhere" */
Assert(data->npos == 0 && !data->negate);
data->negate = true;
return TS_YES;
case TS_YES:
if (data->npos > 0)
{
/* we have some positions, invert negate flag */
data->negate = !data->negate;
return TS_YES;
}
else if (data->negate)
{
/* change "match everywhere" to "match nowhere" */
data->negate = false;
return TS_NO;
}
/* Should not get here if result was TS_YES */
Assert(false);
break;
case TS_MAYBE:
/* match positions are, and remain, uncertain */
return TS_MAYBE;
}
break;
case OP_PHRASE:
case OP_AND:
memset(&Ldata, 0, sizeof(Ldata));
memset(&Rdata, 0, sizeof(Rdata));
lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
arg, flags, chkcond, &Ldata);
if (lmatch == TS_NO)
return TS_NO;
rmatch = TS_phrase_execute(curitem + 1,
arg, flags, chkcond, &Rdata);
if (rmatch == TS_NO)
return TS_NO;
/*
* If either operand has no position information, then we can't
* return reliable position data, only a MAYBE result.
*/
if (lmatch == TS_MAYBE || rmatch == TS_MAYBE)
return TS_MAYBE;
if (curitem->qoperator.oper == OP_PHRASE)
{
/*
* Compute Loffset and Roffset suitable for phrase match, and
* compute overall width of whole phrase match.
*/
Loffset = curitem->qoperator.distance + Rdata.width;
Roffset = 0;
if (data)
data->width = curitem->qoperator.distance +
Ldata.width + Rdata.width;
}
else
{
/*
* For OP_AND, set output width and alignment like OP_OR (see
* comment below)
*/
maxwidth = Max(Ldata.width, Rdata.width);
Loffset = maxwidth - Ldata.width;
Roffset = maxwidth - Rdata.width;
if (data)
data->width = maxwidth;
}
if (Ldata.negate && Rdata.negate)
{
/* !L & !R: treat as !(L | R) */
(void) TS_phrase_output(data, &Ldata, &Rdata,
TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
Loffset, Roffset,
Ldata.npos + Rdata.npos);
if (data)
data->negate = true;
return TS_YES;
}
else if (Ldata.negate)
{
/* !L & R */
return TS_phrase_output(data, &Ldata, &Rdata,
TSPO_R_ONLY,
Loffset, Roffset,
Rdata.npos);
}
else if (Rdata.negate)
{
/* L & !R */
return TS_phrase_output(data, &Ldata, &Rdata,
TSPO_L_ONLY,
Loffset, Roffset,
Ldata.npos);
}
else
{
/* straight AND */
return TS_phrase_output(data, &Ldata, &Rdata,
TSPO_BOTH,
Loffset, Roffset,
Min(Ldata.npos, Rdata.npos));
}
case OP_OR:
memset(&Ldata, 0, sizeof(Ldata));
memset(&Rdata, 0, sizeof(Rdata));
lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
arg, flags, chkcond, &Ldata);
rmatch = TS_phrase_execute(curitem + 1,
arg, flags, chkcond, &Rdata);
if (lmatch == TS_NO && rmatch == TS_NO)
return TS_NO;
/*
* If either operand has no position information, then we can't
* return reliable position data, only a MAYBE result.
*/
if (lmatch == TS_MAYBE || rmatch == TS_MAYBE)
return TS_MAYBE;
/*
* Cope with undefined output width from failed submatch. (This
* takes less code than trying to ensure that all failure returns
* set data->width to zero.)
*/
if (lmatch == TS_NO)
Ldata.width = 0;
if (rmatch == TS_NO)
Rdata.width = 0;
/*
* For OP_AND and OP_OR, report the width of the wider of the two
* inputs, and align the narrower input's positions to the right
* end of that width. This rule deals at least somewhat
* reasonably with cases like "x <-> (y | z <-> q)".
*/
maxwidth = Max(Ldata.width, Rdata.width);
Loffset = maxwidth - Ldata.width;
Roffset = maxwidth - Rdata.width;
data->width = maxwidth;
if (Ldata.negate && Rdata.negate)
{
/* !L | !R: treat as !(L & R) */
(void) TS_phrase_output(data, &Ldata, &Rdata,
TSPO_BOTH,
Loffset, Roffset,
Min(Ldata.npos, Rdata.npos));
data->negate = true;
return TS_YES;
}
else if (Ldata.negate)
{
/* !L | R: treat as !(L & !R) */
(void) TS_phrase_output(data, &Ldata, &Rdata,
TSPO_L_ONLY,
Loffset, Roffset,
Ldata.npos);
data->negate = true;
return TS_YES;
}
else if (Rdata.negate)
{
/* L | !R: treat as !(!L & R) */
(void) TS_phrase_output(data, &Ldata, &Rdata,
TSPO_R_ONLY,
Loffset, Roffset,
Rdata.npos);
data->negate = true;
return TS_YES;
}
else
{
/* straight OR */
return TS_phrase_output(data, &Ldata, &Rdata,
TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
Loffset, Roffset,
Ldata.npos + Rdata.npos);
}
default:
elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
}
/* not reachable, but keep compiler quiet */
return TS_NO;
}
/*
* Evaluate tsquery boolean expression.
*
* curitem: current tsquery item (initially, the first one)
* arg: opaque value to pass through to callback function
* flags: bitmask of flag bits shown in ts_utils.h
* chkcond: callback function to check whether a primitive value is present
*/
bool
TS_execute(QueryItem *curitem, void *arg, uint32 flags,
TSExecuteCallback chkcond)
{
/*
* If we get TS_MAYBE from the recursion, return true. We could only see
* that result if the caller passed TS_EXEC_PHRASE_NO_POS, so there's no
* need to check again.
*/
return TS_execute_recurse(curitem, arg, flags, chkcond) != TS_NO;
}
/*
* Evaluate tsquery boolean expression.
*
* This is the same as TS_execute except that TS_MAYBE is returned as-is.
*/
TSTernaryValue
TS_execute_ternary(QueryItem *curitem, void *arg, uint32 flags,
TSExecuteCallback chkcond)
{
return TS_execute_recurse(curitem, arg, flags, chkcond);
}
/*
* TS_execute recursion for operators above any phrase operator. Here we do
* not need to worry about lexeme positions. As soon as we hit an OP_PHRASE
* operator, we pass it off to TS_phrase_execute which does worry.
*/
static TSTernaryValue
TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags,
TSExecuteCallback chkcond)
{
TSTernaryValue lmatch;
/* since this function recurses, it could be driven to stack overflow */
check_stack_depth();
/* ... and let's check for query cancel while we're at it */
CHECK_FOR_INTERRUPTS();
if (curitem->type == QI_VAL)
return chkcond(arg, (QueryOperand *) curitem,
NULL /* don't need position info */ );
switch (curitem->qoperator.oper)
{
case OP_NOT:
if (flags & TS_EXEC_SKIP_NOT)
return TS_YES;
switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
{
case TS_NO:
return TS_YES;
case TS_YES:
return TS_NO;
case TS_MAYBE:
return TS_MAYBE;
}
break;
case OP_AND:
lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg,
flags, chkcond);
if (lmatch == TS_NO)
return TS_NO;
switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
{
case TS_NO:
return TS_NO;
case TS_YES:
return lmatch;
case TS_MAYBE:
return TS_MAYBE;
}
break;
case OP_OR:
lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg,
flags, chkcond);
if (lmatch == TS_YES)
return TS_YES;
switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
{
case TS_NO:
return lmatch;
case TS_YES:
return TS_YES;
case TS_MAYBE:
return TS_MAYBE;
}
break;
case OP_PHRASE:
/*
* If we get a MAYBE result, and the caller doesn't want that,
* convert it to NO. It would be more consistent, perhaps, to
* return the result of TS_phrase_execute() verbatim and then
* convert MAYBE results at the top of the recursion. But
* converting at the topmost phrase operator gives results that
* are bug-compatible with the old implementation, so do it like
* this for now.
*/
switch (TS_phrase_execute(curitem, arg, flags, chkcond, NULL))
{
case TS_NO:
return TS_NO;
case TS_YES:
return TS_YES;
case TS_MAYBE:
return (flags & TS_EXEC_PHRASE_NO_POS) ? TS_MAYBE : TS_NO;
}
break;
default:
elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
}
/* not reachable, but keep compiler quiet */
return TS_NO;
}
/*
* Evaluate tsquery and report locations of matching terms.
*
* This is like TS_execute except that it returns match locations not just
* success/failure status. The callback function is required to provide
* position data (we report failure if it doesn't).
*
* On successful match, the result is a List of ExecPhraseData structs, one
* for each AND'ed term or phrase operator in the query. Each struct includes
* a sorted array of lexeme positions matching that term. (Recall that for
* phrase operators, the match includes width+1 lexemes, and the recorded
* position is that of the rightmost lexeme.)
*
* OR subexpressions are handled by union'ing their match locations into a
* single List element, which is valid since any of those locations contains
* a match. However, when some of the OR'ed terms are phrase operators, we
* report the maximum width of any of the OR'ed terms, making such cases
* slightly imprecise in the conservative direction. (For example, if the
* tsquery is "(A <-> B) | C", an occurrence of C in the data would be
* reported as though it includes the lexeme to the left of C.)
*
* Locations of NOT subexpressions are not reported. (Obviously, there can
* be no successful NOT matches at top level, or the match would have failed.
* So this amounts to ignoring NOTs underneath ORs.)
*
* The result is NIL if no match, or if position data was not returned.
*
* Arguments are the same as for TS_execute, although flags is currently
* vestigial since none of the defined bits are sensible here.
*/
List *
TS_execute_locations(QueryItem *curitem, void *arg,
uint32 flags,
TSExecuteCallback chkcond)
{
List *result;
/* No flags supported, as yet */
Assert(flags == TS_EXEC_EMPTY);
if (TS_execute_locations_recurse(curitem, arg, chkcond, &result))
return result;
return NIL;
}
/*
* TS_execute_locations recursion for operators above any phrase operator.
* OP_PHRASE subexpressions can be passed off to TS_phrase_execute.
*/
static bool
TS_execute_locations_recurse(QueryItem *curitem, void *arg,
TSExecuteCallback chkcond,
List **locations)
{
bool lmatch,
rmatch;
List *llocations,
*rlocations;
ExecPhraseData *data;
/* since this function recurses, it could be driven to stack overflow */
check_stack_depth();
/* ... and let's check for query cancel while we're at it */
CHECK_FOR_INTERRUPTS();
/* Default locations result is empty */
*locations = NIL;
if (curitem->type == QI_VAL)
{
data = palloc0_object(ExecPhraseData);
if (chkcond(arg, (QueryOperand *) curitem, data) == TS_YES)
{
*locations = list_make1(data);
return true;
}
pfree(data);
return false;
}
switch (curitem->qoperator.oper)
{
case OP_NOT:
if (!TS_execute_locations_recurse(curitem + 1, arg, chkcond,
&llocations))
return true; /* we don't pass back any locations */
return false;
case OP_AND:
if (!TS_execute_locations_recurse(curitem + curitem->qoperator.left,
arg, chkcond,
&llocations))
return false;
if (!TS_execute_locations_recurse(curitem + 1,
arg, chkcond,
&rlocations))
return false;
*locations = list_concat(llocations, rlocations);
return true;
case OP_OR:
lmatch = TS_execute_locations_recurse(curitem + curitem->qoperator.left,
arg, chkcond,
&llocations);
rmatch = TS_execute_locations_recurse(curitem + 1,
arg, chkcond,
&rlocations);
if (lmatch || rmatch)
{
/*
* We generate an AND'able location struct from each
* combination of sub-matches, following the disjunctive law
* (A & B) | (C & D) = (A | C) & (A | D) & (B | C) & (B | D).
*
* However, if either input didn't produce locations (i.e., it
* failed or was a NOT), we must just return the other list.
*/
if (llocations == NIL)
*locations = rlocations;
else if (rlocations == NIL)
*locations = llocations;
else
{
ListCell *ll;
foreach(ll, llocations)
{
ExecPhraseData *ldata = (ExecPhraseData *) lfirst(ll);
ListCell *lr;
foreach(lr, rlocations)
{
ExecPhraseData *rdata = (ExecPhraseData *) lfirst(lr);
data = palloc0_object(ExecPhraseData);
(void) TS_phrase_output(data, ldata, rdata,
TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
0, 0,
ldata->npos + rdata->npos);
/* Report the larger width, as explained above. */
data->width = Max(ldata->width, rdata->width);
*locations = lappend(*locations, data);
}
}
}
return true;
}
return false;
case OP_PHRASE:
/* We can hand this off to TS_phrase_execute */
data = palloc0_object(ExecPhraseData);
if (TS_phrase_execute(curitem, arg, TS_EXEC_EMPTY, chkcond,
data) == TS_YES)
{
if (!data->negate)
*locations = list_make1(data);
return true;
}
pfree(data);
return false;
default:
elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
}
/* not reachable, but keep compiler quiet */
return false;
}
/*
* Detect whether a tsquery boolean expression requires any positive matches
* to values shown in the tsquery.
*
* This is needed to know whether a GIN index search requires full index scan.
* For example, 'x & !y' requires a match of x, so it's sufficient to scan
* entries for x; but 'x | !y' could match rows containing neither x nor y.
*/
bool
tsquery_requires_match(QueryItem *curitem)
{
/* since this function recurses, it could be driven to stack overflow */
check_stack_depth();
if (curitem->type == QI_VAL)
return true;
switch (curitem->qoperator.oper)
{
case OP_NOT:
/*
* Assume there are no required matches underneath a NOT. For
* some cases with nested NOTs, we could prove there's a required
* match, but it seems unlikely to be worth the trouble.
*/
return false;
case OP_PHRASE:
/*
* Treat OP_PHRASE as OP_AND here
*/
case OP_AND:
/* If either side requires a match, we're good */
if (tsquery_requires_match(curitem + curitem->qoperator.left))
return true;
else
return tsquery_requires_match(curitem + 1);
case OP_OR:
/* Both sides must require a match */
if (tsquery_requires_match(curitem + curitem->qoperator.left))
return tsquery_requires_match(curitem + 1);
else
return false;
default:
elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
}
/* not reachable, but keep compiler quiet */
return false;
}
/*
* boolean operations
*/
Datum
ts_match_qv(PG_FUNCTION_ARGS)
{
PG_RETURN_DATUM(DirectFunctionCall2(ts_match_vq,
PG_GETARG_DATUM(1),
PG_GETARG_DATUM(0)));
}
Datum
ts_match_vq(PG_FUNCTION_ARGS)
{
TSVector val = PG_GETARG_TSVECTOR(0);
TSQuery query = PG_GETARG_TSQUERY(1);
CHKVAL chkval;
bool result;
/* empty query matches nothing */
if (!query->size)
{
PG_FREE_IF_COPY(val, 0);
PG_FREE_IF_COPY(query, 1);
PG_RETURN_BOOL(false);
}
chkval.arrb = ARRPTR(val);
chkval.arre = chkval.arrb + val->size;
chkval.values = STRPTR(val);
chkval.operand = GETOPERAND(query);
result = TS_execute(GETQUERY(query),
&chkval,
TS_EXEC_EMPTY,
checkcondition_str);
PG_FREE_IF_COPY(val, 0);
PG_FREE_IF_COPY(query, 1);
PG_RETURN_BOOL(result);
}
Datum
ts_match_tt(PG_FUNCTION_ARGS)
{
TSVector vector;
TSQuery query;
bool res;
vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
PG_GETARG_DATUM(0)));
query = DatumGetTSQuery(DirectFunctionCall1(plainto_tsquery,
PG_GETARG_DATUM(1)));
res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
TSVectorGetDatum(vector),
TSQueryGetDatum(query)));
pfree(vector);
pfree(query);
PG_RETURN_BOOL(res);
}
Datum
ts_match_tq(PG_FUNCTION_ARGS)
{
TSVector vector;
TSQuery query = PG_GETARG_TSQUERY(1);
bool res;
vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
PG_GETARG_DATUM(0)));
res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
TSVectorGetDatum(vector),
TSQueryGetDatum(query)));
pfree(vector);
PG_FREE_IF_COPY(query, 1);
PG_RETURN_BOOL(res);
}
/*
* ts_stat statistic function support
*/
/*
* Returns the number of positions in value 'wptr' within tsvector 'txt',
* that have a weight equal to one of the weights in 'weight' bitmask.
*/
static int
check_weight(TSVector txt, WordEntry *wptr, int8 weight)
{
int len = POSDATALEN(txt, wptr);
int num = 0;
WordEntryPos *ptr = POSDATAPTR(txt, wptr);
while (len--)
{
if (weight & (1 << WEP_GETWEIGHT(*ptr)))
num++;
ptr++;
}
return num;
}
#define compareStatWord(a,e,t) \
tsCompareString((a)->lexeme, (a)->lenlexeme, \
STRPTR(t) + (e)->pos, (e)->len, \
false)
static void
insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off)
{
WordEntry *we = ARRPTR(txt) + off;
StatEntry *node = stat->root,
*pnode = NULL;
int n,
res = 0;
uint32 depth = 1;
if (stat->weight == 0)
n = (we->haspos) ? POSDATALEN(txt, we) : 1;
else
n = (we->haspos) ? check_weight(txt, we, stat->weight) : 0;
if (n == 0)
return; /* nothing to insert */
while (node)
{
res = compareStatWord(node, we, txt);
if (res == 0)
{
break;
}
else
{
pnode = node;
node = (res < 0) ? node->left : node->right;
}
depth++;
}
if (depth > stat->maxdepth)
stat->maxdepth = depth;
if (node == NULL)
{
node = MemoryContextAlloc(persistentContext, STATENTRYHDRSZ + we->len);
node->left = node->right = NULL;
node->ndoc = 1;
node->nentry = n;
node->lenlexeme = we->len;
memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme);
if (pnode == NULL)
{
stat->root = node;
}
else
{
if (res < 0)
pnode->left = node;
else
pnode->right = node;
}
}
else
{
node->ndoc++;
node->nentry += n;
}
}
static void
chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt,
uint32 low, uint32 high, uint32 offset)
{
uint32 pos;
uint32 middle = (low + high) >> 1;
pos = (low + middle) >> 1;
if (low != middle && pos >= offset && pos - offset < txt->size)
insertStatEntry(persistentContext, stat, txt, pos - offset);
pos = (high + middle + 1) >> 1;
if (middle + 1 != high && pos >= offset && pos - offset < txt->size)
insertStatEntry(persistentContext, stat, txt, pos - offset);
if (low != middle)
chooseNextStatEntry(persistentContext, stat, txt, low, middle, offset);
if (high != middle + 1)
chooseNextStatEntry(persistentContext, stat, txt, middle + 1, high, offset);
}
/*
* This is written like a custom aggregate function, because the
* original plan was to do just that. Unfortunately, an aggregate function
* can't return a set, so that plan was abandoned. If that limitation is
* lifted in the future, ts_stat could be a real aggregate function so that
* you could use it like this:
*
* SELECT ts_stat(vector_column) FROM vector_table;
*
* where vector_column is a tsvector-type column in vector_table.
*/
static TSVectorStat *
ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data)
{
TSVector txt = DatumGetTSVector(data);
uint32 i,
nbit = 0,
offset;
if (stat == NULL)
{ /* Init in first */
stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
stat->maxdepth = 1;
}
/* simple check of correctness */
if (txt == NULL || txt->size == 0)
{
if (txt && txt != (TSVector) DatumGetPointer(data))
pfree(txt);
return stat;
}
i = txt->size - 1;
for (; i > 0; i >>= 1)
nbit++;
nbit = 1 << nbit;
offset = (nbit - txt->size) / 2;
insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset);
chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset);
return stat;
}
static void
ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx,
TSVectorStat *stat)
{
TupleDesc tupdesc;
MemoryContext oldcontext;
StatEntry *node;
funcctx->user_fctx = (void *) stat;
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
stat->stack = palloc0(sizeof(StatEntry *) * (stat->maxdepth + 1));
stat->stackpos = 0;
node = stat->root;
/* find leftmost value */
if (node == NULL)
stat->stack[stat->stackpos] = NULL;
else
for (;;)
{
stat->stack[stat->stackpos] = node;
if (node->left)
{
stat->stackpos++;
node = node->left;
}
else
break;
}
Assert(stat->stackpos <= stat->maxdepth);
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
funcctx->tuple_desc = tupdesc;
funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
MemoryContextSwitchTo(oldcontext);
}
static StatEntry *
walkStatEntryTree(TSVectorStat *stat)
{
StatEntry *node = stat->stack[stat->stackpos];
if (node == NULL)
return NULL;
if (node->ndoc != 0)
{
/* return entry itself: we already was at left sublink */
return node;
}
else if (node->right && node->right != stat->stack[stat->stackpos + 1])
{
/* go on right sublink */
stat->stackpos++;
node = node->right;
/* find most-left value */
for (;;)
{
stat->stack[stat->stackpos] = node;
if (node->left)
{
stat->stackpos++;
node = node->left;
}
else
break;
}
Assert(stat->stackpos <= stat->maxdepth);
}
else
{
/* we already return all left subtree, itself and right subtree */
if (stat->stackpos == 0)
return NULL;
stat->stackpos--;
return walkStatEntryTree(stat);
}
return node;
}
static Datum
ts_process_call(FuncCallContext *funcctx)
{
TSVectorStat *st;
StatEntry *entry;
st = (TSVectorStat *) funcctx->user_fctx;
entry = walkStatEntryTree(st);
if (entry != NULL)
{
Datum result;
char *values[3];
char ndoc[16];
char nentry[16];
HeapTuple tuple;
values[0] = palloc(entry->lenlexeme + 1);
memcpy(values[0], entry->lexeme, entry->lenlexeme);
(values[0])[entry->lenlexeme] = '\0';
sprintf(ndoc, "%d", entry->ndoc);
values[1] = ndoc;
sprintf(nentry, "%d", entry->nentry);
values[2] = nentry;
tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
result = HeapTupleGetDatum(tuple);
pfree(values[0]);
/* mark entry as already visited */
entry->ndoc = 0;
return result;
}
return (Datum) 0;
}
static TSVectorStat *
ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
{
char *query = text_to_cstring(txt);
TSVectorStat *stat;
bool isnull;
Portal portal;
SPIPlanPtr plan;
if ((plan = SPI_prepare(query, 0, NULL)) == NULL)
/* internal error */
elog(ERROR, "SPI_prepare(\"%s\") failed", query);
if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, true)) == NULL)
/* internal error */
elog(ERROR, "SPI_cursor_open(\"%s\") failed", query);
SPI_cursor_fetch(portal, true, 100);
if (SPI_tuptable == NULL ||
SPI_tuptable->tupdesc->natts != 1 ||
!IsBinaryCoercible(SPI_gettypeid(SPI_tuptable->tupdesc, 1),
TSVECTOROID))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("ts_stat query must return one tsvector column")));
stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
stat->maxdepth = 1;
if (ws)
{
char *buf;
buf = VARDATA_ANY(ws);
while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws))
{
if (pg_mblen(buf) == 1)
{
switch (*buf)
{
case 'A':
case 'a':
stat->weight |= 1 << 3;
break;
case 'B':
case 'b':
stat->weight |= 1 << 2;
break;
case 'C':
case 'c':
stat->weight |= 1 << 1;
break;
case 'D':
case 'd':
stat->weight |= 1;
break;
default:
stat->weight |= 0;
}
}
buf += pg_mblen(buf);
}
}
while (SPI_processed > 0)
{
uint64 i;
for (i = 0; i < SPI_processed; i++)
{
Datum data = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull);
if (!isnull)
stat = ts_accum(persistentContext, stat, data);
}
SPI_freetuptable(SPI_tuptable);
SPI_cursor_fetch(portal, true, 100);
}
SPI_freetuptable(SPI_tuptable);
SPI_cursor_close(portal);
SPI_freeplan(plan);
pfree(query);
return stat;
}
Datum
ts_stat1(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
Datum result;
if (SRF_IS_FIRSTCALL())
{
TSVectorStat *stat;
text *txt = PG_GETARG_TEXT_PP(0);
funcctx = SRF_FIRSTCALL_INIT();
SPI_connect();
stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, NULL);
PG_FREE_IF_COPY(txt, 0);
ts_setup_firstcall(fcinfo, funcctx, stat);
SPI_finish();
}
funcctx = SRF_PERCALL_SETUP();
if ((result = ts_process_call(funcctx)) != (Datum) 0)
SRF_RETURN_NEXT(funcctx, result);
SRF_RETURN_DONE(funcctx);
}
Datum
ts_stat2(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
Datum result;
if (SRF_IS_FIRSTCALL())
{
TSVectorStat *stat;
text *txt = PG_GETARG_TEXT_PP(0);
text *ws = PG_GETARG_TEXT_PP(1);
funcctx = SRF_FIRSTCALL_INIT();
SPI_connect();
stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, ws);
PG_FREE_IF_COPY(txt, 0);
PG_FREE_IF_COPY(ws, 1);
ts_setup_firstcall(fcinfo, funcctx, stat);
SPI_finish();
}
funcctx = SRF_PERCALL_SETUP();
if ((result = ts_process_call(funcctx)) != (Datum) 0)
SRF_RETURN_NEXT(funcctx, result);
SRF_RETURN_DONE(funcctx);
}
/*
* Triggers for automatic update of a tsvector column from text column(s)
*
* Trigger arguments are either
* name of tsvector col, name of tsconfig to use, name(s) of text col(s)
* name of tsvector col, name of regconfig col, name(s) of text col(s)
* ie, tsconfig can either be specified by name, or indirectly as the
* contents of a regconfig field in the row. If the name is used, it must
* be explicitly schema-qualified.
*/
Datum
tsvector_update_trigger_byid(PG_FUNCTION_ARGS)
{
return tsvector_update_trigger(fcinfo, false);
}
Datum
tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS)
{
return tsvector_update_trigger(fcinfo, true);
}
static Datum
tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column)
{
TriggerData *trigdata;
Trigger *trigger;
Relation rel;
HeapTuple rettuple = NULL;
int tsvector_attr_num,
i;
ParsedText prs;
Datum datum;
bool isnull;
text *txt;
Oid cfgId;
bool update_needed;
/* Check call context */
if (!CALLED_AS_TRIGGER(fcinfo)) /* internal error */
elog(ERROR, "tsvector_update_trigger: not fired by trigger manager");
trigdata = (TriggerData *) fcinfo->context;
if (!TRIGGER_FIRED_FOR_ROW(trigdata->tg_event))
elog(ERROR, "tsvector_update_trigger: must be fired for row");
if (!TRIGGER_FIRED_BEFORE(trigdata->tg_event))
elog(ERROR, "tsvector_update_trigger: must be fired BEFORE event");
if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
{
rettuple = trigdata->tg_trigtuple;
update_needed = true;
}
else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
{
rettuple = trigdata->tg_newtuple;
update_needed = false; /* computed below */
}
else
elog(ERROR, "tsvector_update_trigger: must be fired for INSERT or UPDATE");
trigger = trigdata->tg_trigger;
rel = trigdata->tg_relation;
if (trigger->tgnargs < 3)
elog(ERROR, "tsvector_update_trigger: arguments must be tsvector_field, ts_config, text_field1, ...)");
/* Find the target tsvector column */
tsvector_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[0]);
if (tsvector_attr_num == SPI_ERROR_NOATTRIBUTE)
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("tsvector column \"%s\" does not exist",
trigger->tgargs[0])));
/* This will effectively reject system columns, so no separate test: */
if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, tsvector_attr_num),
TSVECTOROID))
ereport(ERROR,
(errcode(ERRCODE_DATATYPE_MISMATCH),
errmsg("column \"%s\" is not of tsvector type",
trigger->tgargs[0])));
/* Find the configuration to use */
if (config_column)
{
int config_attr_num;
config_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[1]);
if (config_attr_num == SPI_ERROR_NOATTRIBUTE)
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("configuration column \"%s\" does not exist",
trigger->tgargs[1])));
if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, config_attr_num),
REGCONFIGOID))
ereport(ERROR,
(errcode(ERRCODE_DATATYPE_MISMATCH),
errmsg("column \"%s\" is not of regconfig type",
trigger->tgargs[1])));
datum = SPI_getbinval(rettuple, rel->rd_att, config_attr_num, &isnull);
if (isnull)
ereport(ERROR,
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("configuration column \"%s\" must not be null",
trigger->tgargs[1])));
cfgId = DatumGetObjectId(datum);
}
else
{
List *names;
names = stringToQualifiedNameList(trigger->tgargs[1], NULL);
/* require a schema so that results are not search path dependent */
if (list_length(names) < 2)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("text search configuration name \"%s\" must be schema-qualified",
trigger->tgargs[1])));
cfgId = get_ts_config_oid(names, false);
}
/* initialize parse state */
prs.lenwords = 32;
prs.curwords = 0;
prs.pos = 0;
prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
/* find all words in indexable column(s) */
for (i = 2; i < trigger->tgnargs; i++)
{
int numattr;
numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]);
if (numattr == SPI_ERROR_NOATTRIBUTE)
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("column \"%s\" does not exist",
trigger->tgargs[i])));
if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, numattr), TEXTOID))
ereport(ERROR,
(errcode(ERRCODE_DATATYPE_MISMATCH),
errmsg("column \"%s\" is not of a character type",
trigger->tgargs[i])));
if (bms_is_member(numattr - FirstLowInvalidHeapAttributeNumber, trigdata->tg_updatedcols))
update_needed = true;
datum = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull);
if (isnull)
continue;
txt = DatumGetTextPP(datum);
parsetext(cfgId, &prs, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt));
if (txt != (text *) DatumGetPointer(datum))
pfree(txt);
}
if (update_needed)
{
/* make tsvector value */
datum = TSVectorGetDatum(make_tsvector(&prs));
isnull = false;
/* and insert it into tuple */
rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
1, &tsvector_attr_num,
&datum, &isnull);
pfree(DatumGetPointer(datum));
}
return PointerGetDatum(rettuple);
}