Tsvector editing functions

Adds several tsvector editting function: convert tsvector to/from text array, set weight for given lexemes, delete lexeme(s), unnest, filter lexemes with given weights Author: Stas Kelvich with some editorization by me Reviewers: Tomas Vondram, Teodor Sigaev
2025-07-08 11:42:09 +03:00 · 2016-03-11 19:22:36 +03:00
parent 49635d7b3e
commit 6943a946c7
7 changed files with 933 additions and 18 deletions
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@ -14,6 +14,7 @@

 #include "postgres.h"

+#include "access/htup_details.h"
 #include "catalog/namespace.h"
 #include "catalog/pg_type.h"
 #include "commands/trigger.h"
@ -65,6 +66,7 @@ typedef struct
 #define STATHDRSIZE (offsetof(TSVectorStat, data))

 static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
+static int tsvector_bsearch(TSVector tsin, char *lexin, int lexin_len);

 /*
 * Order: haspos, len, word, for all positions (pos, weight)
@ -251,6 +253,90 @@ tsvector_setweight(PG_FUNCTION_ARGS)
 	PG_RETURN_POINTER(out);
 }

+/*
+ * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
+ *
+ * Assign weight w to elements of tsin that are listed in lexemes.
+ */
+Datum
+tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
+{
+	TSVector	tsin = PG_GETARG_TSVECTOR(0);
+	char		char_weight = PG_GETARG_CHAR(1);
+	ArrayType  *lexemes = PG_GETARG_ARRAYTYPE_P(2);
+
+	TSVector	tsout;
+	int			i,
+				j,
+				nlexemes,
+				weight;
+	WordEntry  *entry;
+	Datum	   *dlexemes;
+	bool	   *nulls;
+
+	switch (char_weight)
+	{
+		case 'A': case 'a':
+			weight = 3;
+			break;
+		case 'B': case 'b':
+			weight = 2;
+			break;
+		case 'C': case 'c':
+			weight = 1;
+			break;
+		case 'D': case 'd':
+			weight = 0;
+			break;
+		default:
+			/* internal error */
+			elog(ERROR, "unrecognized weight: %c", char_weight);
+	}
+
+	tsout = (TSVector) palloc(VARSIZE(tsin));
+	memcpy(tsout, tsin, VARSIZE(tsin));
+	entry = ARRPTR(tsout);
+
+	deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+					  &dlexemes, &nulls, &nlexemes);
+
+	/*
+	 * Assuming that lexemes array is significantly shorter than tsvector
+	 * we can iterate through lexemes performing binary search
+	 * of each lexeme from lexemes in tsvector.
+	 */
+	for (i = 0; i < nlexemes; i++)
+	{
+		char   *lex;
+		int		lex_len,
+				lex_pos;
+
+		if (nulls[i])
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("lexeme array may not contain nulls")));
+
+		lex = VARDATA(dlexemes[i]);
+		lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+		lex_pos = tsvector_bsearch(tsout, lex, lex_len);
+
+		if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
+		{
+			WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
+			while (j--)
+			{
+				WEP_SETWEIGHT(*p, weight);
+				p++;
+			}
+		}
+	}
+
+	PG_FREE_IF_COPY(tsin, 0);
+	PG_FREE_IF_COPY(lexemes, 2);
+
+	PG_RETURN_POINTER(tsout);
+}
+
 #define compareEntry(pa, a, pb, b) \
 	tsCompareString((pa) + (a)->pos, (a)->len,	\
 					(pb) + (b)->pos, (b)->len,	\
@ -291,6 +377,483 @@ add_pos(TSVector src, WordEntry *srcptr,
 	return *clen - startlen;
 }

+/*
+ * Perform binary search of given lexeme in TSVector.
+ * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
+ * found.
+ */
+static int
+tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
+{
+	WordEntry  *arrin = ARRPTR(tsv);
+	int			StopLow = 0,
+				StopHigh = tsv->size,
+				StopMiddle,
+				cmp;
+
+	while (StopLow < StopHigh)
+	{
+		StopMiddle = (StopLow + StopHigh)/2;
+
+		cmp = tsCompareString(lexeme, lexeme_len,
+			STRPTR(tsv) + arrin[StopMiddle].pos,
+			arrin[StopMiddle].len,
+			false);
+
+		if (cmp < 0)
+			StopHigh = StopMiddle;
+		else if (cmp > 0)
+			StopLow = StopMiddle + 1;
+		else /* found it */
+			return StopMiddle;
+	}
+
+	return -1;
+}
+
+static int
+compareint(const void *va, const void *vb)
+{
+	int32		a = *((const int32 *) va);
+	int32		b = *((const int32 *) vb);
+
+	if (a == b)
+		return 0;
+	return (a > b) ? 1 : -1;
+}
+
+/*
+ * Internal routine to delete lexemes from TSVector by array of offsets.
+ *
+ * int *indices_to_delete -- array of lexeme offsets to delete
+ * int indices_count -- size of that array
+ *
+ * Returns new TSVector without given lexemes along with their positions
+ * and weights.
+ */
+static TSVector
+tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
+						   int indices_count)
+{
+	TSVector	tsout;
+	WordEntry  *arrin = ARRPTR(tsv),
+			   *arrout;
+	char	   *data = STRPTR(tsv),
+			   *dataout;
+	int			i, j, k,
+				curoff;
+
+	/*
+	 * Here we overestimates tsout size, since we don't know exact size
+	 * occupied by positions and weights. We will set exact size later
+	 * after a pass through TSVector.
+	 */
+	tsout = (TSVector) palloc0(VARSIZE(tsv));
+	arrout = ARRPTR(tsout);
+	tsout->size = tsv->size - indices_count;
+
+	/* Sort our filter array to simplify membership check later. */
+	if (indices_count > 1)
+		qsort(indices_to_delete, indices_count, sizeof(int), compareint);
+
+	/*
+	 * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
+	 */
+	curoff = 0;
+	dataout = STRPTR(tsout);
+	for (i = j = k = 0; i < tsv->size; i++)
+	{
+		/*
+		 * Here we should check whether current i is present in
+		 * indices_to_delete or not. Since indices_to_delete is already
+		 * sorted we can advance it index only when we have match.
+		 */
+		if (k < indices_count && i == indices_to_delete[k]){
+			k++;
+			continue;
+		}
+
+		/* Copy lexeme, it's positions and weights */
+		memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
+		arrout[j].haspos = arrin[i].haspos;
+		arrout[j].len = arrin[i].len;
+		arrout[j].pos = curoff;
+		curoff += arrin[i].len;
+		if (arrin[i].haspos)
+		{
+			int len = POSDATALEN(tsv, arrin+i) * sizeof(WordEntryPos) +
+					  sizeof(uint16);
+			curoff = SHORTALIGN(curoff);
+			memcpy(dataout + curoff,
+				   STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
+				   len);
+			curoff += len;
+		}
+
+		j++;
+	}
+
+	/*
+	 * After the pass through TSVector k should equals exactly to indices_count.
+	 * If it isn't then the caller provided us with indices outside of
+	 * [0, tsv->size) range and estimation of tsout's size is wrong.
+	 */
+	Assert(k == indices_count);
+
+	SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+	return tsout;
+}
+
+/*
+ * Delete given lexeme from tsvector.
+ * Implementation of user-level delete(tsvector, text).
+ */
+Datum
+tsvector_delete_str(PG_FUNCTION_ARGS)
+{
+	TSVector	tsin = PG_GETARG_TSVECTOR(0),
+				tsout;
+	text	   *tlexeme = PG_GETARG_TEXT_P(1);
+	char	   *lexeme = VARDATA(tlexeme);
+	int			lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
+				skip_index;
+
+	if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
+		PG_RETURN_POINTER(tsin);
+
+	tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
+
+	PG_FREE_IF_COPY(tsin, 0);
+	PG_FREE_IF_COPY(tlexeme, 1);
+	PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Delete given array of lexemes from tsvector.
+ * Implementation of user-level delete(tsvector, text[]).
+ */
+Datum
+tsvector_delete_arr(PG_FUNCTION_ARGS)
+{
+	TSVector	tsin = PG_GETARG_TSVECTOR(0),
+				tsout;
+	ArrayType  *lexemes = PG_GETARG_ARRAYTYPE_P(1);
+	int			i, nlex,
+				skip_count,
+			   *skip_indices;
+	Datum	   *dlexemes;
+	bool	   *nulls;
+
+	deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+					  &dlexemes, &nulls, &nlex);
+
+	/*
+	 * In typical use case array of lexemes to delete is relatively small.
+	 * So here we optimizing things for that scenario: iterate through lexarr
+	 * performing binary search of each lexeme from lexarr in tsvector.
+	 */
+	skip_indices = palloc0(nlex * sizeof(int));
+	for (i = skip_count = 0; i < nlex; i++)
+	{
+		char *lex;
+		int lex_len,
+			lex_pos;
+
+		if (nulls[i])
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("lexeme array may not contain nulls")));
+
+		lex = VARDATA(dlexemes[i]);
+		lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+		lex_pos = tsvector_bsearch(tsin, lex, lex_len);
+
+		if (lex_pos >= 0)
+			skip_indices[skip_count++] = lex_pos;
+	}
+
+	tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
+
+	pfree(skip_indices);
+	PG_FREE_IF_COPY(tsin, 0);
+	PG_FREE_IF_COPY(lexemes, 1);
+
+	PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Expand tsvector as table with following columns:
+ *     lexeme: lexeme text
+ *     positions: integer array of lexeme positions
+ *     weights: char array of weights corresponding to positions
+ */
+Datum
+tsvector_unnest(PG_FUNCTION_ARGS)
+{
+	FuncCallContext	   *funcctx;
+	TSVector			tsin;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		MemoryContext oldcontext;
+		TupleDesc	tupdesc;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		tupdesc = CreateTemplateTupleDesc(3, false);
+		TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
+						   TEXTOID, -1, 0);
+		TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
+						   INT2ARRAYOID, -1, 0);
+		TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
+						   TEXTARRAYOID, -1, 0);
+		funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+
+		funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
+
+		MemoryContextSwitchTo(oldcontext);
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+	tsin = (TSVector) funcctx->user_fctx;
+
+	if (funcctx->call_cntr < tsin->size)
+	{
+		WordEntry  *arrin = ARRPTR(tsin);
+		char	   *data = STRPTR(tsin);
+		HeapTuple	tuple;
+		int			j,
+					i = funcctx->call_cntr;
+		bool		nulls[] = {false, false, false};
+		Datum		values[3];
+
+		values[0] = PointerGetDatum(
+				cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)
+		);
+
+		if (arrin[i].haspos)
+		{
+			WordEntryPosVector *posv;
+			Datum	   *positions;
+			Datum	   *weights;
+			char		weight;
+
+			/*
+			 * Internally tsvector stores position and weight in the same
+			 * uint16 (2 bits for weight, 14 for position). Here we extract that
+			 * in two separate arrays.
+			 */
+			posv = _POSVECPTR(tsin, arrin + i);
+			positions = palloc(posv->npos * sizeof(Datum));
+			weights   = palloc(posv->npos * sizeof(Datum));
+			for (j = 0; j < posv->npos; j++)
+			{
+				positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
+				weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
+				weights[j] = PointerGetDatum(
+									cstring_to_text_with_len(&weight, 1)
+							);
+			}
+
+			values[1] = PointerGetDatum(
+				construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+			values[2] = PointerGetDatum(
+				construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+		}
+		else
+		{
+			nulls[1] = nulls[2] = true;
+		}
+
+		tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+		SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+	}
+	else
+	{
+		pfree(tsin);
+		SRF_RETURN_DONE(funcctx);
+	}
+}
+
+/*
+ * Convert tsvector to array of lexemes.
+ */
+Datum
+tsvector_to_array(PG_FUNCTION_ARGS)
+{
+	TSVector			tsin  = PG_GETARG_TSVECTOR(0);
+	WordEntry		   *arrin = ARRPTR(tsin);
+	Datum				elements[tsin->size];
+	int					i;
+	ArrayType		   *array;
+
+	for (i = 0; i < tsin->size; i++)
+	{
+		elements[i] = PointerGetDatum(
+			cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len)
+		);
+	}
+
+	array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+	PG_FREE_IF_COPY(tsin, 0);
+	PG_RETURN_POINTER(array);
+}
+
+/*
+ * Build tsvector from array of lexemes.
+ */
+Datum
+array_to_tsvector(PG_FUNCTION_ARGS)
+{
+	ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
+	TSVector	tsout;
+	Datum	   *dlexemes;
+	WordEntry  *arrout;
+	bool	   *nulls;
+	int			nitems,
+				i,
+				tslen,
+				datalen = 0;
+	char	   *cur;
+
+	deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
+
+	for (i = 0; i < nitems; i++)
+	{
+		if (nulls[i])
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("lexeme array may not contain nulls")));
+
+		datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
+	}
+
+	tslen = CALCDATASIZE(nitems, datalen);
+	tsout = (TSVector) palloc0(tslen);
+	SET_VARSIZE(tsout, tslen);
+	tsout->size = nitems;
+	arrout = ARRPTR(tsout);
+	cur = STRPTR(tsout);
+
+	for (i = 0; i < nitems; i++)
+	{
+		char *lex = VARDATA(dlexemes[i]);
+		int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+		memcpy(cur, lex, lex_len);
+		arrout[i].haspos = 0;
+		arrout[i].len = lex_len;
+		arrout[i].pos = cur - STRPTR(tsout);
+		cur += lex_len;
+	}
+
+	PG_FREE_IF_COPY(v, 0);
+	PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Leave only elements with given weights from tsvector.
+ */
+Datum
+tsvector_filter(PG_FUNCTION_ARGS)
+{
+	TSVector	tsin = PG_GETARG_TSVECTOR(0),
+				tsout;
+	ArrayType  *weights = PG_GETARG_ARRAYTYPE_P(1);
+	WordEntry  *arrin = ARRPTR(tsin),
+			   *arrout;
+	char	   *datain = STRPTR(tsin),
+			   *dataout;
+	Datum	   *dweights;
+	bool	   *nulls;
+	int			nweigths;
+	int			i, j;
+	char		mask = 0,
+				cur_pos = 0;
+
+	deconstruct_array(weights, CHAROID, 1, true, 'c',
+					  &dweights, &nulls, &nweigths);
+
+	for (i = 0; i < nweigths; i++)
+	{
+		char char_weight;
+
+		if (nulls[i])
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("weight array may not contain nulls")));
+
+		char_weight = DatumGetChar(dweights[i]);
+		switch (char_weight)
+		{
+			case 'A': case 'a':
+				mask = mask | 8;
+				break;
+			case 'B': case 'b':
+				mask = mask | 4;
+				break;
+			case 'C': case 'c':
+				mask = mask | 2;
+				break;
+			case 'D': case 'd':
+				mask = mask | 1;
+				break;
+			default:
+				/* internal error */
+				elog(ERROR, "unrecognized weight: %c", char_weight);
+		}
+	}
+
+	tsout = (TSVector) palloc0(VARSIZE(tsin));
+	tsout->size = tsin->size;
+	arrout = ARRPTR(tsout);
+	dataout = STRPTR(tsout);
+
+	for (i = j = 0; i < tsin->size; i++)
+	{
+		WordEntryPosVector *posvin,
+						   *posvout;
+		int npos = 0;
+		int k;
+
+		if (!arrin[i].haspos)
+			continue;
+
+		posvin  = _POSVECPTR(tsin, arrin + i);
+		posvout = (WordEntryPosVector *)
+						(dataout + SHORTALIGN(cur_pos + arrin[i].len));
+
+		for (k = 0; k < posvin->npos; k++)
+		{
+			if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
+				posvout->pos[npos++] = posvin->pos[k];
+		}
+
+		if (!npos) /* no satisfactory positions found, so skip that lexeme */
+			continue;
+
+		arrout[j].haspos = true;
+		arrout[j].len = arrin[i].len;
+		arrout[j].pos = cur_pos;
+
+		memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
+		posvout->npos = npos;
+		cur_pos += SHORTALIGN(arrin[i].len);
+		cur_pos += POSDATALEN(tsout, arrout+j) * sizeof(WordEntryPos) +
+				   sizeof(uint16);
+		j++;
+	}
+
+	tsout->size = j;
+	if (dataout != STRPTR(tsout))
+		memmove(STRPTR(tsout), dataout, cur_pos);
+
+	SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+
+	PG_FREE_IF_COPY(tsin, 0);
+	PG_RETURN_POINTER(tsout);
+}

 Datum
 tsvector_concat(PG_FUNCTION_ARGS)