mirror of
https://github.com/postgres/postgres.git
synced 2025-07-08 11:42:09 +03:00
Tsvector editing functions
Adds several tsvector editing functions: convert tsvector to/from text array, set weight for given lexemes, delete lexeme(s), unnest, filter lexemes with given weights. Author: Stas Kelvich, with some editing by me. Reviewers: Tomas Vondram, Teodor Sigaev
This commit is contained in:
@ -14,6 +14,7 @@
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/htup_details.h"
|
||||
#include "catalog/namespace.h"
|
||||
#include "catalog/pg_type.h"
|
||||
#include "commands/trigger.h"
|
||||
@ -65,6 +66,7 @@ typedef struct
|
||||
#define STATHDRSIZE (offsetof(TSVectorStat, data))
|
||||
|
||||
static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
|
||||
static int tsvector_bsearch(TSVector tsin, char *lexin, int lexin_len);
|
||||
|
||||
/*
|
||||
* Order: haspos, len, word, for all positions (pos, weight)
|
||||
@ -251,6 +253,90 @@ tsvector_setweight(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_POINTER(out);
|
||||
}
|
||||
|
||||
/*
|
||||
* setweight(tsin tsvector, char_weight "char", lexemes "text"[])
|
||||
*
|
||||
* Assign weight w to elements of tsin that are listed in lexemes.
|
||||
*/
|
||||
Datum
|
||||
tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
|
||||
{
|
||||
TSVector tsin = PG_GETARG_TSVECTOR(0);
|
||||
char char_weight = PG_GETARG_CHAR(1);
|
||||
ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2);
|
||||
|
||||
TSVector tsout;
|
||||
int i,
|
||||
j,
|
||||
nlexemes,
|
||||
weight;
|
||||
WordEntry *entry;
|
||||
Datum *dlexemes;
|
||||
bool *nulls;
|
||||
|
||||
switch (char_weight)
|
||||
{
|
||||
case 'A': case 'a':
|
||||
weight = 3;
|
||||
break;
|
||||
case 'B': case 'b':
|
||||
weight = 2;
|
||||
break;
|
||||
case 'C': case 'c':
|
||||
weight = 1;
|
||||
break;
|
||||
case 'D': case 'd':
|
||||
weight = 0;
|
||||
break;
|
||||
default:
|
||||
/* internal error */
|
||||
elog(ERROR, "unrecognized weight: %c", char_weight);
|
||||
}
|
||||
|
||||
tsout = (TSVector) palloc(VARSIZE(tsin));
|
||||
memcpy(tsout, tsin, VARSIZE(tsin));
|
||||
entry = ARRPTR(tsout);
|
||||
|
||||
deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
|
||||
&dlexemes, &nulls, &nlexemes);
|
||||
|
||||
/*
|
||||
* Assuming that lexemes array is significantly shorter than tsvector
|
||||
* we can iterate through lexemes performing binary search
|
||||
* of each lexeme from lexemes in tsvector.
|
||||
*/
|
||||
for (i = 0; i < nlexemes; i++)
|
||||
{
|
||||
char *lex;
|
||||
int lex_len,
|
||||
lex_pos;
|
||||
|
||||
if (nulls[i])
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("lexeme array may not contain nulls")));
|
||||
|
||||
lex = VARDATA(dlexemes[i]);
|
||||
lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
|
||||
lex_pos = tsvector_bsearch(tsout, lex, lex_len);
|
||||
|
||||
if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
|
||||
{
|
||||
WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
|
||||
while (j--)
|
||||
{
|
||||
WEP_SETWEIGHT(*p, weight);
|
||||
p++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
PG_FREE_IF_COPY(tsin, 0);
|
||||
PG_FREE_IF_COPY(lexemes, 2);
|
||||
|
||||
PG_RETURN_POINTER(tsout);
|
||||
}
|
||||
|
||||
#define compareEntry(pa, a, pb, b) \
|
||||
tsCompareString((pa) + (a)->pos, (a)->len, \
|
||||
(pb) + (b)->pos, (b)->len, \
|
||||
@ -291,6 +377,483 @@ add_pos(TSVector src, WordEntry *srcptr,
|
||||
return *clen - startlen;
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform binary search of given lexeme in TSVector.
|
||||
* Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
|
||||
* found.
|
||||
*/
|
||||
static int
|
||||
tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
|
||||
{
|
||||
WordEntry *arrin = ARRPTR(tsv);
|
||||
int StopLow = 0,
|
||||
StopHigh = tsv->size,
|
||||
StopMiddle,
|
||||
cmp;
|
||||
|
||||
while (StopLow < StopHigh)
|
||||
{
|
||||
StopMiddle = (StopLow + StopHigh)/2;
|
||||
|
||||
cmp = tsCompareString(lexeme, lexeme_len,
|
||||
STRPTR(tsv) + arrin[StopMiddle].pos,
|
||||
arrin[StopMiddle].len,
|
||||
false);
|
||||
|
||||
if (cmp < 0)
|
||||
StopHigh = StopMiddle;
|
||||
else if (cmp > 0)
|
||||
StopLow = StopMiddle + 1;
|
||||
else /* found it */
|
||||
return StopMiddle;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int
|
||||
compareint(const void *va, const void *vb)
|
||||
{
|
||||
int32 a = *((const int32 *) va);
|
||||
int32 b = *((const int32 *) vb);
|
||||
|
||||
if (a == b)
|
||||
return 0;
|
||||
return (a > b) ? 1 : -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Internal routine to delete lexemes from TSVector by array of offsets.
|
||||
*
|
||||
* int *indices_to_delete -- array of lexeme offsets to delete
|
||||
* int indices_count -- size of that array
|
||||
*
|
||||
* Returns new TSVector without given lexemes along with their positions
|
||||
* and weights.
|
||||
*/
|
||||
static TSVector
|
||||
tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
|
||||
int indices_count)
|
||||
{
|
||||
TSVector tsout;
|
||||
WordEntry *arrin = ARRPTR(tsv),
|
||||
*arrout;
|
||||
char *data = STRPTR(tsv),
|
||||
*dataout;
|
||||
int i, j, k,
|
||||
curoff;
|
||||
|
||||
/*
|
||||
* Here we overestimates tsout size, since we don't know exact size
|
||||
* occupied by positions and weights. We will set exact size later
|
||||
* after a pass through TSVector.
|
||||
*/
|
||||
tsout = (TSVector) palloc0(VARSIZE(tsv));
|
||||
arrout = ARRPTR(tsout);
|
||||
tsout->size = tsv->size - indices_count;
|
||||
|
||||
/* Sort our filter array to simplify membership check later. */
|
||||
if (indices_count > 1)
|
||||
qsort(indices_to_delete, indices_count, sizeof(int), compareint);
|
||||
|
||||
/*
|
||||
* Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
|
||||
*/
|
||||
curoff = 0;
|
||||
dataout = STRPTR(tsout);
|
||||
for (i = j = k = 0; i < tsv->size; i++)
|
||||
{
|
||||
/*
|
||||
* Here we should check whether current i is present in
|
||||
* indices_to_delete or not. Since indices_to_delete is already
|
||||
* sorted we can advance it index only when we have match.
|
||||
*/
|
||||
if (k < indices_count && i == indices_to_delete[k]){
|
||||
k++;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Copy lexeme, it's positions and weights */
|
||||
memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
|
||||
arrout[j].haspos = arrin[i].haspos;
|
||||
arrout[j].len = arrin[i].len;
|
||||
arrout[j].pos = curoff;
|
||||
curoff += arrin[i].len;
|
||||
if (arrin[i].haspos)
|
||||
{
|
||||
int len = POSDATALEN(tsv, arrin+i) * sizeof(WordEntryPos) +
|
||||
sizeof(uint16);
|
||||
curoff = SHORTALIGN(curoff);
|
||||
memcpy(dataout + curoff,
|
||||
STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
|
||||
len);
|
||||
curoff += len;
|
||||
}
|
||||
|
||||
j++;
|
||||
}
|
||||
|
||||
/*
|
||||
* After the pass through TSVector k should equals exactly to indices_count.
|
||||
* If it isn't then the caller provided us with indices outside of
|
||||
* [0, tsv->size) range and estimation of tsout's size is wrong.
|
||||
*/
|
||||
Assert(k == indices_count);
|
||||
|
||||
SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
|
||||
return tsout;
|
||||
}
|
||||
|
||||
/*
|
||||
* Delete given lexeme from tsvector.
|
||||
* Implementation of user-level delete(tsvector, text).
|
||||
*/
|
||||
Datum
|
||||
tsvector_delete_str(PG_FUNCTION_ARGS)
|
||||
{
|
||||
TSVector tsin = PG_GETARG_TSVECTOR(0),
|
||||
tsout;
|
||||
text *tlexeme = PG_GETARG_TEXT_P(1);
|
||||
char *lexeme = VARDATA(tlexeme);
|
||||
int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
|
||||
skip_index;
|
||||
|
||||
if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
|
||||
PG_RETURN_POINTER(tsin);
|
||||
|
||||
tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
|
||||
|
||||
PG_FREE_IF_COPY(tsin, 0);
|
||||
PG_FREE_IF_COPY(tlexeme, 1);
|
||||
PG_RETURN_POINTER(tsout);
|
||||
}
|
||||
|
||||
/*
|
||||
* Delete given array of lexemes from tsvector.
|
||||
* Implementation of user-level delete(tsvector, text[]).
|
||||
*/
|
||||
Datum
|
||||
tsvector_delete_arr(PG_FUNCTION_ARGS)
|
||||
{
|
||||
TSVector tsin = PG_GETARG_TSVECTOR(0),
|
||||
tsout;
|
||||
ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
|
||||
int i, nlex,
|
||||
skip_count,
|
||||
*skip_indices;
|
||||
Datum *dlexemes;
|
||||
bool *nulls;
|
||||
|
||||
deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
|
||||
&dlexemes, &nulls, &nlex);
|
||||
|
||||
/*
|
||||
* In typical use case array of lexemes to delete is relatively small.
|
||||
* So here we optimizing things for that scenario: iterate through lexarr
|
||||
* performing binary search of each lexeme from lexarr in tsvector.
|
||||
*/
|
||||
skip_indices = palloc0(nlex * sizeof(int));
|
||||
for (i = skip_count = 0; i < nlex; i++)
|
||||
{
|
||||
char *lex;
|
||||
int lex_len,
|
||||
lex_pos;
|
||||
|
||||
if (nulls[i])
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("lexeme array may not contain nulls")));
|
||||
|
||||
lex = VARDATA(dlexemes[i]);
|
||||
lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
|
||||
lex_pos = tsvector_bsearch(tsin, lex, lex_len);
|
||||
|
||||
if (lex_pos >= 0)
|
||||
skip_indices[skip_count++] = lex_pos;
|
||||
}
|
||||
|
||||
tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
|
||||
|
||||
pfree(skip_indices);
|
||||
PG_FREE_IF_COPY(tsin, 0);
|
||||
PG_FREE_IF_COPY(lexemes, 1);
|
||||
|
||||
PG_RETURN_POINTER(tsout);
|
||||
}
|
||||
|
||||
/*
|
||||
* Expand tsvector as table with following columns:
|
||||
* lexeme: lexeme text
|
||||
* positions: integer array of lexeme positions
|
||||
* weights: char array of weights corresponding to positions
|
||||
*/
|
||||
Datum
|
||||
tsvector_unnest(PG_FUNCTION_ARGS)
|
||||
{
|
||||
FuncCallContext *funcctx;
|
||||
TSVector tsin;
|
||||
|
||||
if (SRF_IS_FIRSTCALL())
|
||||
{
|
||||
MemoryContext oldcontext;
|
||||
TupleDesc tupdesc;
|
||||
|
||||
funcctx = SRF_FIRSTCALL_INIT();
|
||||
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
|
||||
|
||||
tupdesc = CreateTemplateTupleDesc(3, false);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
|
||||
TEXTOID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
|
||||
INT2ARRAYOID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
|
||||
TEXTARRAYOID, -1, 0);
|
||||
funcctx->tuple_desc = BlessTupleDesc(tupdesc);
|
||||
|
||||
funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
|
||||
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
}
|
||||
|
||||
funcctx = SRF_PERCALL_SETUP();
|
||||
tsin = (TSVector) funcctx->user_fctx;
|
||||
|
||||
if (funcctx->call_cntr < tsin->size)
|
||||
{
|
||||
WordEntry *arrin = ARRPTR(tsin);
|
||||
char *data = STRPTR(tsin);
|
||||
HeapTuple tuple;
|
||||
int j,
|
||||
i = funcctx->call_cntr;
|
||||
bool nulls[] = {false, false, false};
|
||||
Datum values[3];
|
||||
|
||||
values[0] = PointerGetDatum(
|
||||
cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)
|
||||
);
|
||||
|
||||
if (arrin[i].haspos)
|
||||
{
|
||||
WordEntryPosVector *posv;
|
||||
Datum *positions;
|
||||
Datum *weights;
|
||||
char weight;
|
||||
|
||||
/*
|
||||
* Internally tsvector stores position and weight in the same
|
||||
* uint16 (2 bits for weight, 14 for position). Here we extract that
|
||||
* in two separate arrays.
|
||||
*/
|
||||
posv = _POSVECPTR(tsin, arrin + i);
|
||||
positions = palloc(posv->npos * sizeof(Datum));
|
||||
weights = palloc(posv->npos * sizeof(Datum));
|
||||
for (j = 0; j < posv->npos; j++)
|
||||
{
|
||||
positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
|
||||
weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
|
||||
weights[j] = PointerGetDatum(
|
||||
cstring_to_text_with_len(&weight, 1)
|
||||
);
|
||||
}
|
||||
|
||||
values[1] = PointerGetDatum(
|
||||
construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
|
||||
values[2] = PointerGetDatum(
|
||||
construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
|
||||
}
|
||||
else
|
||||
{
|
||||
nulls[1] = nulls[2] = true;
|
||||
}
|
||||
|
||||
tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
|
||||
SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
|
||||
}
|
||||
else
|
||||
{
|
||||
pfree(tsin);
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert tsvector to array of lexemes.
|
||||
*/
|
||||
Datum
|
||||
tsvector_to_array(PG_FUNCTION_ARGS)
|
||||
{
|
||||
TSVector tsin = PG_GETARG_TSVECTOR(0);
|
||||
WordEntry *arrin = ARRPTR(tsin);
|
||||
Datum elements[tsin->size];
|
||||
int i;
|
||||
ArrayType *array;
|
||||
|
||||
for (i = 0; i < tsin->size; i++)
|
||||
{
|
||||
elements[i] = PointerGetDatum(
|
||||
cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len)
|
||||
);
|
||||
}
|
||||
|
||||
array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
|
||||
PG_FREE_IF_COPY(tsin, 0);
|
||||
PG_RETURN_POINTER(array);
|
||||
}
|
||||
|
||||
/*
|
||||
* Build tsvector from array of lexemes.
|
||||
*/
|
||||
Datum
|
||||
array_to_tsvector(PG_FUNCTION_ARGS)
|
||||
{
|
||||
ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
|
||||
TSVector tsout;
|
||||
Datum *dlexemes;
|
||||
WordEntry *arrout;
|
||||
bool *nulls;
|
||||
int nitems,
|
||||
i,
|
||||
tslen,
|
||||
datalen = 0;
|
||||
char *cur;
|
||||
|
||||
deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
|
||||
|
||||
for (i = 0; i < nitems; i++)
|
||||
{
|
||||
if (nulls[i])
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("lexeme array may not contain nulls")));
|
||||
|
||||
datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
|
||||
}
|
||||
|
||||
tslen = CALCDATASIZE(nitems, datalen);
|
||||
tsout = (TSVector) palloc0(tslen);
|
||||
SET_VARSIZE(tsout, tslen);
|
||||
tsout->size = nitems;
|
||||
arrout = ARRPTR(tsout);
|
||||
cur = STRPTR(tsout);
|
||||
|
||||
for (i = 0; i < nitems; i++)
|
||||
{
|
||||
char *lex = VARDATA(dlexemes[i]);
|
||||
int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
|
||||
|
||||
memcpy(cur, lex, lex_len);
|
||||
arrout[i].haspos = 0;
|
||||
arrout[i].len = lex_len;
|
||||
arrout[i].pos = cur - STRPTR(tsout);
|
||||
cur += lex_len;
|
||||
}
|
||||
|
||||
PG_FREE_IF_COPY(v, 0);
|
||||
PG_RETURN_POINTER(tsout);
|
||||
}
|
||||
|
||||
/*
|
||||
* Leave only elements with given weights from tsvector.
|
||||
*/
|
||||
Datum
|
||||
tsvector_filter(PG_FUNCTION_ARGS)
|
||||
{
|
||||
TSVector tsin = PG_GETARG_TSVECTOR(0),
|
||||
tsout;
|
||||
ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
|
||||
WordEntry *arrin = ARRPTR(tsin),
|
||||
*arrout;
|
||||
char *datain = STRPTR(tsin),
|
||||
*dataout;
|
||||
Datum *dweights;
|
||||
bool *nulls;
|
||||
int nweigths;
|
||||
int i, j;
|
||||
char mask = 0,
|
||||
cur_pos = 0;
|
||||
|
||||
deconstruct_array(weights, CHAROID, 1, true, 'c',
|
||||
&dweights, &nulls, &nweigths);
|
||||
|
||||
for (i = 0; i < nweigths; i++)
|
||||
{
|
||||
char char_weight;
|
||||
|
||||
if (nulls[i])
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("weight array may not contain nulls")));
|
||||
|
||||
char_weight = DatumGetChar(dweights[i]);
|
||||
switch (char_weight)
|
||||
{
|
||||
case 'A': case 'a':
|
||||
mask = mask | 8;
|
||||
break;
|
||||
case 'B': case 'b':
|
||||
mask = mask | 4;
|
||||
break;
|
||||
case 'C': case 'c':
|
||||
mask = mask | 2;
|
||||
break;
|
||||
case 'D': case 'd':
|
||||
mask = mask | 1;
|
||||
break;
|
||||
default:
|
||||
/* internal error */
|
||||
elog(ERROR, "unrecognized weight: %c", char_weight);
|
||||
}
|
||||
}
|
||||
|
||||
tsout = (TSVector) palloc0(VARSIZE(tsin));
|
||||
tsout->size = tsin->size;
|
||||
arrout = ARRPTR(tsout);
|
||||
dataout = STRPTR(tsout);
|
||||
|
||||
for (i = j = 0; i < tsin->size; i++)
|
||||
{
|
||||
WordEntryPosVector *posvin,
|
||||
*posvout;
|
||||
int npos = 0;
|
||||
int k;
|
||||
|
||||
if (!arrin[i].haspos)
|
||||
continue;
|
||||
|
||||
posvin = _POSVECPTR(tsin, arrin + i);
|
||||
posvout = (WordEntryPosVector *)
|
||||
(dataout + SHORTALIGN(cur_pos + arrin[i].len));
|
||||
|
||||
for (k = 0; k < posvin->npos; k++)
|
||||
{
|
||||
if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
|
||||
posvout->pos[npos++] = posvin->pos[k];
|
||||
}
|
||||
|
||||
if (!npos) /* no satisfactory positions found, so skip that lexeme */
|
||||
continue;
|
||||
|
||||
arrout[j].haspos = true;
|
||||
arrout[j].len = arrin[i].len;
|
||||
arrout[j].pos = cur_pos;
|
||||
|
||||
memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
|
||||
posvout->npos = npos;
|
||||
cur_pos += SHORTALIGN(arrin[i].len);
|
||||
cur_pos += POSDATALEN(tsout, arrout+j) * sizeof(WordEntryPos) +
|
||||
sizeof(uint16);
|
||||
j++;
|
||||
}
|
||||
|
||||
tsout->size = j;
|
||||
if (dataout != STRPTR(tsout))
|
||||
memmove(STRPTR(tsout), dataout, cur_pos);
|
||||
|
||||
SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
|
||||
|
||||
PG_FREE_IF_COPY(tsin, 0);
|
||||
PG_RETURN_POINTER(tsout);
|
||||
}
|
||||
|
||||
Datum
|
||||
tsvector_concat(PG_FUNCTION_ARGS)
|
||||
|
Reference in New Issue
Block a user