Extend GIN to support partial-match searches, and extend tsquery to support
prefix matching using this facility.

Teodor Sigaev and Oleg Bartunov
2025-11-29 23:43:17 +03:00 · 2008-05-16 16:31:02 +00:00
parent e1bdd07c3c
commit e6dbcb72fa
32 changed files with 1284 additions and 508 deletions
--- a/src/backend/utils/adt/tsginidx.c
+++ b/src/backend/utils/adt/tsginidx.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.11 2008/04/14 17:05:33 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.12 2008/05/16 16:31:01 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -19,6 +19,46 @@
 #include "utils/builtins.h"


+Datum
+gin_cmp_tslexeme(PG_FUNCTION_ARGS)
+{
+	text    *a = PG_GETARG_TEXT_P(0);
+	text    *b = PG_GETARG_TEXT_P(1);
+	int     cmp;
+
+	cmp = tsCompareString(
+					VARDATA(a), VARSIZE(a) - VARHDRSZ,
+					VARDATA(b), VARSIZE(b) - VARHDRSZ,
+					false );
+
+	PG_FREE_IF_COPY(a,0);
+	PG_FREE_IF_COPY(b,1);
+	PG_RETURN_INT32( cmp );
+}
+
+Datum
+gin_cmp_prefix(PG_FUNCTION_ARGS)
+{
+	text    *a = PG_GETARG_TEXT_P(0);
+	text    *b = PG_GETARG_TEXT_P(1);
+#ifdef NOT_USED
+	StrategyNumber strategy = PG_GETARG_UINT16(2);
+#endif
+	int     cmp;
+
+	cmp = tsCompareString(
+					VARDATA(a), VARSIZE(a) - VARHDRSZ,
+					VARDATA(b), VARSIZE(b) - VARHDRSZ,
+					true );
+
+	if ( cmp < 0 )
+		cmp = 1;  /* map "less" to "greater" to prevent the scan from continuing */
+
+	PG_FREE_IF_COPY(a,0);
+	PG_FREE_IF_COPY(b,1);
+	PG_RETURN_INT32( cmp );
+}
+
 Datum
 gin_extract_tsvector(PG_FUNCTION_ARGS)
 {
@@ -55,7 +95,9 @@ gin_extract_tsquery(PG_FUNCTION_ARGS)
 	TSQuery		query = PG_GETARG_TSQUERY(0);
 	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
 	/* StrategyNumber strategy = PG_GETARG_UINT16(2); */
+	bool      **ptr_partialmatch = (bool**) PG_GETARG_POINTER(3);
 	Datum	   *entries = NULL;
+	bool       *partialmatch;

 	*nentries = 0;

@@ -65,12 +107,14 @@ gin_extract_tsquery(PG_FUNCTION_ARGS)
 					j = 0,
 					len;
 		QueryItem  *item;
+		bool		use_fullscan=false;

 		item = clean_NOT(GETQUERY(query), &len);
 		if (!item)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("query requires full scan, which is not supported by GIN indexes")));
+		{
+			use_fullscan = true;
+			*nentries = 1;
+		}

 		item = GETQUERY(query);

@@ -79,6 +123,7 @@ gin_extract_tsquery(PG_FUNCTION_ARGS)
 				(*nentries)++;

 		entries = (Datum *) palloc(sizeof(Datum) * (*nentries));
+		partialmatch = *ptr_partialmatch = (bool*) palloc(sizeof(bool) * (*nentries));

 		for (i = 0; i < query->size; i++)
 			if (item[i].type == QI_VAL)
@@ -88,8 +133,21 @@ gin_extract_tsquery(PG_FUNCTION_ARGS)

 				txt = cstring_to_text_with_len(GETOPERAND(query) + val->distance,
 											   val->length);
+				partialmatch[j] = val->prefix;
 				entries[j++] = PointerGetDatum(txt);
 			}
+
+		if ( use_fullscan )
+		{
+			/*
+			 * The extra empty-string key stands for "scan everything": as a
+			 * prefix it matches any lexeme.  Its partialmatch slot must be set
+			 * explicitly, since the array is palloc'd uninitialized.
+			 * NOTE(review): confirm GIN expects a partial match here.
+			 */
+			partialmatch[j] = true;
+			entries[j++] = PointerGetDatum(cstring_to_text_with_len("", 0));
+		}
 	}
 	else
 		*nentries = -1;			/* nothing can be found */
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsgistidx.c,v 1.8 2008/04/14 17:05:33 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsgistidx.c,v 1.9 2008/05/16 16:31:01 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -307,6 +307,12 @@ checkcondition_arr(void *checkval, QueryOperand *val)

 	/* Loop invariant: StopLow <= val < StopHigh */

+	/* 
+	 * we are not able to find a prefix by hash value, so assume it may match
+	 */
+	if ( val->prefix )
+		return true;
+
 	while (StopLow < StopHigh)
 	{
 		StopMiddle = StopLow + (StopHigh - StopLow) / 2;
@@ -324,6 +330,11 @@ checkcondition_arr(void *checkval, QueryOperand *val)
 static bool
 checkcondition_bit(void *checkval, QueryOperand *val)
 {
+	/*
+	 * a prefix can't be found via its hash bit, so assume it may match
+	 */
+	if ( val->prefix )
+		return true; 
 	return GETBIT(checkval, HASHVAL(val->valcrc));
 }

--- a/src/backend/utils/adt/tsquery.c
+++ b/src/backend/utils/adt/tsquery.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.17 2008/04/11 22:52:05 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.18 2008/05/16 16:31:01 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -56,12 +56,14 @@ struct TSQueryParserStateData
 #define WAITSINGLEOPERAND 4

 /*
- * subroutine to parse the weight part, like ':1AB' of a query.
+ * subroutine to parse the modifiers (weight and prefix flag currently) 
+ * part, like ':1AB' of a query.
 */
 static char *
-get_weight(char *buf, int16 *weight)
+get_modifiers(char *buf, int16 *weight, bool *prefix)
 {
 	*weight = 0;
+	*prefix = false;

 	if (!t_iseq(buf, ':'))
 		return buf;
@@ -87,6 +89,9 @@ get_weight(char *buf, int16 *weight)
 			case 'D':
 				*weight |= 1;
 				break;
+			case '*':
+				*prefix = true;
+				break;
 			default:
 				return buf;
 		}
@@ -118,8 +123,11 @@ typedef enum
 static ts_tokentype
 gettoken_query(TSQueryParserState state,
 			   int8 *operator,
-			   int *lenval, char **strval, int16 *weight)
+			   int *lenval, char **strval, int16 *weight, bool *prefix)
 {
+	*weight = 0;
+	*prefix = false;
+
 	while (1)
 	{
 		switch (state->state)
@@ -157,7 +165,7 @@ gettoken_query(TSQueryParserState state,
 					reset_tsvector_parser(state->valstate, state->buf);
 					if (gettoken_tsvector(state->valstate, strval, lenval, NULL, NULL, &state->buf))
 					{
-						state->buf = get_weight(state->buf, weight);
+						state->buf = get_modifiers(state->buf, weight, prefix);
 						state->state = WAITOPERATOR;
 						return PT_VAL;
 					}
@@ -232,7 +240,7 @@ pushOperator(TSQueryParserState state, int8 oper)
 }

 static void
-pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int lenval, int weight)
+pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int lenval, int weight, bool prefix)
 {
 	QueryOperand *tmp;

@@ -250,6 +258,7 @@ pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int
 	tmp = (QueryOperand *) palloc0(sizeof(QueryOperand));
 	tmp->type = QI_VAL;
 	tmp->weight = weight;
+	tmp->prefix = prefix;
 	tmp->valcrc = (int32) valcrc;
 	tmp->length = lenval;
 	tmp->distance = distance;
@@ -264,7 +273,7 @@ pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int
 * of the string.
 */
 void
-pushValue(TSQueryParserState state, char *strval, int lenval, int2 weight)
+pushValue(TSQueryParserState state, char *strval, int lenval, int2 weight, bool prefix)
 {
 	pg_crc32	valcrc;

@@ -277,7 +286,7 @@ pushValue(TSQueryParserState state, char *strval, int lenval, int2 weight)
 	INIT_CRC32(valcrc);
 	COMP_CRC32(valcrc, strval, lenval);
 	FIN_CRC32(valcrc);
-	pushValue_internal(state, valcrc, state->curop - state->op, lenval, weight);
+	pushValue_internal(state, valcrc, state->curop - state->op, lenval, weight, prefix);

 	/* append the value string to state.op, enlarging buffer if needed first */
 	while (state->curop - state->op + lenval + 1 >= state->lenop)
@@ -330,16 +339,17 @@ makepol(TSQueryParserState state,
 	int8		opstack[STACKDEPTH];
 	int			lenstack = 0;
 	int16		weight = 0;
+	bool		prefix;

 	/* since this function recurses, it could be driven to stack overflow */
 	check_stack_depth();

-	while ((type = gettoken_query(state, &operator, &lenval, &strval, &weight)) != PT_END)
+	while ((type = gettoken_query(state, &operator, &lenval, &strval, &weight, &prefix)) != PT_END)
 	{
 		switch (type)
 		{
 			case PT_VAL:
-				pushval(opaque, state, strval, lenval, weight);
+				pushval(opaque, state, strval, lenval, weight, prefix);
 				while (lenstack && (opstack[lenstack - 1] == OP_AND ||
 									opstack[lenstack - 1] == OP_NOT))
 				{
@@ -549,9 +559,9 @@ parse_tsquery(char *buf,

 static void
 pushval_asis(Datum opaque, TSQueryParserState state, char *strval, int lenval,
-			 int16 weight)
+			 int16 weight, bool prefix)
 {
-	pushValue(state, strval, lenval, weight);
+	pushValue(state, strval, lenval, weight, prefix);
 }

 /*
@@ -605,7 +615,7 @@ infix(INFIX *in, bool first)
 		char	   *op = in->op + curpol->distance;
 		int			clen;

-		RESIZEBUF(in, curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 5);
+		RESIZEBUF(in, curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 6);
 		*(in->cur) = '\'';
 		in->cur++;
 		while (*op)
@@ -628,10 +638,15 @@ infix(INFIX *in, bool first)
 		}
 		*(in->cur) = '\'';
 		in->cur++;
-		if (curpol->weight)
+		if (curpol->weight || curpol->prefix)
 		{
 			*(in->cur) = ':';
 			in->cur++;
+			if ( curpol->prefix )
+			{
+				*(in->cur) = '*';
+				in->cur++;
+			}
 			if (curpol->weight & (1 << 3))
 			{
 				*(in->cur) = 'A';
@@ -769,6 +784,7 @@ tsqueryout(PG_FUNCTION_ARGS)
 * uint8	type, QI_VAL
 * uint8	weight
+ * uint8	prefix
 *			operand text in client encoding, null-terminated
 *
 * For each operator:
 * uint8	type, QI_OPR
@@ -793,6 +809,7 @@ tsquerysend(PG_FUNCTION_ARGS)
 		{
 			case QI_VAL:
 				pq_sendint(&buf, item->operand.weight, sizeof(uint8));
+				pq_sendint(&buf, item->operand.prefix, sizeof(uint8));
 				pq_sendstring(&buf, GETOPERAND(query) + item->operand.distance);
 				break;
 			case QI_OPR:
@@ -844,10 +861,12 @@ tsqueryrecv(PG_FUNCTION_ARGS)
 		{
 			size_t		val_len;	/* length after recoding to server encoding */
 			uint8		weight;
+			uint8		prefix;
 			const char *val;
 			pg_crc32	valcrc;

 			weight = (uint8) pq_getmsgint(buf, sizeof(uint8));
+			prefix = (uint8) pq_getmsgint(buf, sizeof(uint8));
 			val = pq_getmsgstring(buf);
 			val_len = strlen(val);

@@ -869,6 +888,7 @@ tsqueryrecv(PG_FUNCTION_ARGS)
 			FIN_CRC32(valcrc);

 			item->operand.weight = weight;
+			item->operand.prefix = (prefix) ? true : false;
 			item->operand.valcrc = (int32) valcrc;
 			item->operand.length = val_len;
 			item->operand.distance = datalen;
--- a/src/backend/utils/adt/tsquery_util.c
+++ b/src/backend/utils/adt/tsquery_util.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.8 2008/01/01 19:45:53 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.9 2008/05/16 16:31:01 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -125,10 +125,7 @@ QTNodeCompare(QTNode *an, QTNode *bn)
 			return (ao->valcrc > bo->valcrc) ? -1 : 1;
 		}

-		if (ao->length == bo->length)
-			return strncmp(an->word, bn->word, ao->length);
-		else
-			return (ao->length > bo->length) ? -1 : 1;
+		return tsCompareString( an->word, ao->length, bn->word, bo->length, false);
 	}
 }

--- a/src/backend/utils/adt/tsrank.c
+++ b/src/backend/utils/adt/tsrank.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.12 2008/01/01 19:45:53 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.13 2008/05/16 16:31:01 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -71,45 +71,60 @@ cnt_length(TSVector t)
 	return len;
 }

-static int
-WordECompareQueryItem(char *eval, char *qval, WordEntry *ptr, QueryOperand *item)
-{
-	if (ptr->len == item->length)
-		return strncmp(
-					   eval + ptr->pos,
-					   qval + item->distance,
-					   item->length);

-	return (ptr->len > item->length) ? 1 : -1;
-}
+#define	WordECompareQueryItem(e,q,p,i,m) \
+	tsCompareString((q) + (i)->distance, (i)->length,	\
+					(e) + (p)->pos,	(p)->len, (m))
+

 /*
- * Returns a pointer to a WordEntry corresponding 'item' from tsvector 't'. 'q'
- * is the TSQuery containing 'item'. Returns NULL if not found.
+ * Returns a pointer to the first WordEntry of tsvector 't' that matches
+ * 'item' (an operand of TSQuery 'q') and sets *nitem to the number of
+ * consecutive matching entries. Returns NULL, with *nitem = 0, if none match.
 */
 static WordEntry *
-find_wordentry(TSVector t, TSQuery q, QueryOperand *item)
+find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
 {
 	WordEntry  *StopLow = ARRPTR(t);
 	WordEntry  *StopHigh = (WordEntry *) STRPTR(t);
-	WordEntry  *StopMiddle;
+	WordEntry  *StopMiddle = StopHigh;
 	int			difference;

-	/* Loop invariant: StopLow <= item < StopHigh */
+	*nitem=0;

+	/* Loop invariant: StopLow <= item < StopHigh */
 	while (StopLow < StopHigh)
 	{
 		StopMiddle = StopLow + (StopHigh - StopLow) / 2;
-		difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item);
+		difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
 		if (difference == 0)
-			return StopMiddle;
-		else if (difference < 0)
+		{
+			StopHigh = StopMiddle;
+			*nitem=1;
+			break;
+		}
+		else if (difference > 0)
 			StopLow = StopMiddle + 1;
 		else
 			StopHigh = StopMiddle;
 	}

-	return NULL;
+	if ( item->prefix == true )
+	{
+		if ( StopLow >= StopHigh )
+			StopMiddle = StopHigh;
+
+		*nitem=0;
+
+		while( StopMiddle < (WordEntry *) STRPTR(t) && 
+				WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0 )
+		{
+			(*nitem)++;
+			StopMiddle++;
+		}
+	}
+
+	return ( *nitem > 0 ) ? StopHigh : NULL;
 }


@@ -123,12 +138,9 @@ compareQueryOperand(const void *a, const void *b, void *arg)
 	QueryOperand *qa = (*(QueryOperand **) a);
 	QueryOperand *qb = (*(QueryOperand **) b);

-	if (qa->length == qb->length)
-		return strncmp(operand + qa->distance,
-					   operand + qb->distance,
-					   qb->length);
-
-	return (qa->length > qb->length) ? 1 : -1;
+	return tsCompareString(operand + qa->distance, qa->length,
+						   operand + qb->distance, qb->length,
+						   false);
 }

 /*
@@ -198,12 +210,14 @@ calc_rank_and(float *w, TSVector t, TSQuery q)
 				k,
 				l,
 				p;
-	WordEntry  *entry;
+	WordEntry  *entry,
+			   *firstentry;
 	WordEntryPos *post,
 			   *ct;
 	int4		dimt,
 				lenct,
-				dist;
+				dist,
+				nitem;
 	float		res = -1.0;
 	QueryOperand **item;
 	int			size = q->size;
@@ -219,40 +233,44 @@ calc_rank_and(float *w, TSVector t, TSQuery q)

 	for (i = 0; i < size; i++)
 	{
-		entry = find_wordentry(t, q, item[i]);
+		firstentry = entry = find_wordentry(t, q, item[i], &nitem);
 		if (!entry)
 			continue;

-		if (entry->haspos)
-			pos[i] = _POSVECPTR(t, entry);
-		else
-			pos[i] = &POSNULL;
-
-
-		dimt = pos[i]->npos;
-		post = pos[i]->pos;
-		for (k = 0; k < i; k++)
+		while( entry - firstentry < nitem )
 		{
-			if (!pos[k])
-				continue;
-			lenct = pos[k]->npos;
-			ct = pos[k]->pos;
-			for (l = 0; l < dimt; l++)
-			{
-				for (p = 0; p < lenct; p++)
-				{
-					dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
-					if (dist || (dist == 0 && (pos[i] == &POSNULL || pos[k] == &POSNULL)))
-					{
-						float		curw;
+			if (entry->haspos)
+				pos[i] = _POSVECPTR(t, entry);
+			else
+				pos[i] = &POSNULL;

-						if (!dist)
-							dist = MAXENTRYPOS;
-						curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
-						res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
+			dimt = pos[i]->npos;
+			post = pos[i]->pos;
+			for (k = 0; k < i; k++)
+			{
+				if (!pos[k])
+					continue;
+				lenct = pos[k]->npos;
+				ct = pos[k]->pos;
+				for (l = 0; l < dimt; l++)
+				{
+					for (p = 0; p < lenct; p++)
+					{
+						dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
+						if (dist || (dist == 0 && (pos[i] == &POSNULL || pos[k] == &POSNULL)))
+						{
+							float		curw;
+	
+							if (!dist)
+								dist = MAXENTRYPOS;
+							curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
+							res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
+						}
 					}
 				}
 			}
+
+			entry++;
 		}
 	}
 	pfree(pos);
@@ -263,11 +281,13 @@ calc_rank_and(float *w, TSVector t, TSQuery q)
 static float
 calc_rank_or(float *w, TSVector t, TSQuery q)
 {
-	WordEntry  *entry;
+	WordEntry  *entry,
+			   *firstentry;
 	WordEntryPos *post;
 	int4		dimt,
 				j,
-				i;
+				i,
+				nitem;
 	float		res = 0.0;
 	QueryOperand **item;
 	int			size = q->size;
@@ -280,41 +300,46 @@ calc_rank_or(float *w, TSVector t, TSQuery q)
 					wjm;
 		int4		jm;

-		entry = find_wordentry(t, q, item[i]);
+		firstentry = entry = find_wordentry(t, q, item[i], &nitem);
 		if (!entry)
 			continue;

-		if (entry->haspos)
+		while( entry - firstentry < nitem )
 		{
-			dimt = POSDATALEN(t, entry);
-			post = POSDATAPTR(t, entry);
-		}
-		else
-		{
-			dimt = POSNULL.npos;
-			post = POSNULL.pos;
-		}
-
-		resj = 0.0;
-		wjm = -1.0;
-		jm = 0;
-		for (j = 0; j < dimt; j++)
-		{
-			resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
-			if (wpos(post[j]) > wjm)
+			if (entry->haspos)
 			{
-				wjm = wpos(post[j]);
-				jm = j;
+				dimt = POSDATALEN(t, entry);
+				post = POSDATAPTR(t, entry);
+			}
+			else
+			{
+				dimt = POSNULL.npos;
+				post = POSNULL.pos;
+			}
+
+			resj = 0.0;
+			wjm = -1.0;
+			jm = 0;
+			for (j = 0; j < dimt; j++)
+			{
+				resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
+				if (wpos(post[j]) > wjm)
+				{
+					wjm = wpos(post[j]);
+					jm = j;
+				}
 			}
-		}
 /*
-		limit (sum(i/i^2),i->inf) = pi^2/6
-		resj = sum(wi/i^2),i=1,noccurence,
-		wi - should be sorted desc,
-		don't sort for now, just choose maximum weight. This should be corrected
-		Oleg Bartunov
+			limit (sum(i/i^2),i->inf) = pi^2/6
+			resj = sum(wi/i^2),i=1,noccurence,
+			wi - should be sorted desc,
+			don't sort for now, just choose maximum weight. This should be corrected
+			Oleg Bartunov
 */
-		res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
+			res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
+
+			entry++;
+		}
 	}
 	if (size > 0)
 		res = res / size;
@@ -594,11 +619,13 @@ static DocRepresentation *
 get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
 {
 	QueryItem  *item = GETQUERY(qr->query);
-	WordEntry  *entry;
+	WordEntry  *entry,
+			   *firstentry;
 	WordEntryPos *post;
 	int4		dimt,
 				j,
-				i;
+				i,
+				nitem;
 	int			len = qr->query->size * 4,
 				cur = 0;
 	DocRepresentation *doc;
@@ -619,63 +646,68 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
 		if (QR_GET_OPERAND_EXISTS(qr, &item[i]))
 			continue;

-		entry = find_wordentry(txt, qr->query, curoperand);
+		firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
 		if (!entry)
 			continue;

-		if (entry->haspos)
+		while( entry - firstentry < nitem )
 		{
-			dimt = POSDATALEN(txt, entry);
-			post = POSDATAPTR(txt, entry);
-		}
-		else
-		{
-			dimt = POSNULL.npos;
-			post = POSNULL.pos;
-		}
-
-		while (cur + dimt >= len)
-		{
-			len *= 2;
-			doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
-		}
-
-		for (j = 0; j < dimt; j++)
-		{
-			if (j == 0)
+			if (entry->haspos)
 			{
-				int			k;
-
-				doc[cur].nitem = 0;
-				doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * qr->query->size);
-
-				for (k = 0; k < qr->query->size; k++)
-				{
-					QueryOperand *kptr = &item[k].operand;
-					QueryOperand *iptr = &item[i].operand;
-
-					if (k == i ||
-						(item[k].type == QI_VAL &&
-						 compareQueryOperand(&kptr, &iptr, operand) == 0))
-					{
-						/*
-						 * if k == i, we've already checked above that it's
-						 * type == Q_VAL
-						 */
-						doc[cur].item[doc[cur].nitem] = item + k;
-						doc[cur].nitem++;
-						QR_SET_OPERAND_EXISTS(qr, item + k);
-					}
-				}
+				dimt = POSDATALEN(txt, entry);
+				post = POSDATAPTR(txt, entry);
 			}
 			else
 			{
-				doc[cur].nitem = doc[cur - 1].nitem;
-				doc[cur].item = doc[cur - 1].item;
+				dimt = POSNULL.npos;
+				post = POSNULL.pos;
 			}
-			doc[cur].pos = WEP_GETPOS(post[j]);
-			doc[cur].wclass = WEP_GETWEIGHT(post[j]);
-			cur++;
+
+			while (cur + dimt >= len)
+			{
+				len *= 2;
+				doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
+			}
+
+			for (j = 0; j < dimt; j++)
+			{
+				if (j == 0)
+				{
+					int			k;
+	
+					doc[cur].nitem = 0;
+					doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * qr->query->size);
+	
+					for (k = 0; k < qr->query->size; k++)
+					{
+						QueryOperand *kptr = &item[k].operand;
+						QueryOperand *iptr = &item[i].operand;
+	
+						if (k == i ||
+							(item[k].type == QI_VAL &&
+							 compareQueryOperand(&kptr, &iptr, operand) == 0))
+						{
+							/*
+							 * if k == i, we've already checked above that it's
+							 * type == Q_VAL
+							 */
+							doc[cur].item[doc[cur].nitem] = item + k;
+							doc[cur].nitem++;
+							QR_SET_OPERAND_EXISTS(qr, item + k);
+						}
+					}
+				}
+				else
+				{
+					doc[cur].nitem = doc[cur - 1].nitem;
+					doc[cur].item = doc[cur - 1].item;
+				}
+				doc[cur].pos = WEP_GETPOS(post[j]);
+				doc[cur].wclass = WEP_GETWEIGHT(post[j]);
+				cur++;
+			}
+
+			entry++;
 		}
 	}

--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.13 2008/03/10 12:57:05 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.14 2008/05/16 16:31:01 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -85,14 +85,9 @@ compareentry(const void *va, const void *vb, void *arg)
 	const WordEntryIN *b = (const WordEntryIN *) vb;
 	char	   *BufferStr = (char *) arg;

-	if (a->entry.len == b->entry.len)
-	{
-		return strncmp(&BufferStr[a->entry.pos],
-					   &BufferStr[b->entry.pos],
-					   a->entry.len);
-	}
-
-	return (a->entry.len > b->entry.len) ? 1 : -1;
+	return tsCompareString( &BufferStr[a->entry.pos], a->entry.len,
+							&BufferStr[b->entry.pos], b->entry.len,
+							false );
 }

 /*
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.15 2008/04/08 18:20:29 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.16 2008/05/16 16:31:01 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -127,11 +127,7 @@ silly_cmp_tsvector(const TSVector a, const TSVector b)
 			{
 				return (aptr->haspos > bptr->haspos) ? -1 : 1;
 			}
-			else if (aptr->len != bptr->len)
-			{
-				return (aptr->len > bptr->len) ? -1 : 1;
-			}
-			else if ((res = strncmp(STRPTR(a) + aptr->pos, STRPTR(b) + bptr->pos, bptr->len)) != 0)
+			else if ( (res=tsCompareString( STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) !=0 )
 			{
 				return res;
 			}
@@ -286,18 +282,10 @@ tsvector_setweight(PG_FUNCTION_ARGS)
 	PG_RETURN_POINTER(out);
 }

-static int
-compareEntry(char *ptra, WordEntry *a, char *ptrb, WordEntry *b)
-{
-	if (a->len == b->len)
-	{
-		return strncmp(
-					   ptra + a->pos,
-					   ptrb + b->pos,
-					   a->len);
-	}
-	return (a->len > b->len) ? 1 : -1;
-}
+#define compareEntry(pa, a, pb, b) \
+	tsCompareString((pa) + (a)->pos, (a)->len,	\
+					(pb) + (b)->pos, (b)->len,	\
+					false)

 /*
 * Add positions from src to dest after offsetting them by maxpos.
@@ -534,18 +522,46 @@ tsvector_concat(PG_FUNCTION_ARGS)
 }

 /*
- * compare 2 string values
+ * Compare two strings by tsvector rules.
+ * If prefix = true, the result is zero if and only if b begins with a.
 */
-static int4
-ValCompare(CHKVAL *chkval, WordEntry *ptr, QueryOperand *item)
+int4
+tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
 {
-	if (ptr->len == item->length)
-		return strncmp(
-					   &(chkval->values[ptr->pos]),
-					   &(chkval->operand[item->distance]),
-					   item->length);
+	int cmp;

-	return (ptr->len > item->length) ? 1 : -1;
+	if ( lena == 0 )
+	{
+		if ( prefix )
+			cmp = 0; /* an empty string is a prefix of any string */
+		else
+			cmp = (lenb>0) ? -1 : 0;
+	}
+	else if ( lenb == 0 )
+	{
+		cmp = (lena>0) ? 1 : 0;
+	}
+	else
+	{
+		cmp = memcmp(a, b, Min(lena, lenb));
+
+		if ( prefix )
+		{
+			if ( cmp == 0 && lena > lenb )
+			{
+				/*
+				 * a is longer than b, so b cannot begin with a
+				 */
+				cmp=1;
+			}
+		}
+		else if ( (cmp == 0) && (lena != lenb) )
+		{
+			cmp = (lena < lenb) ? -1 : 1;
+		}
+	}
+
+	return cmp;
 }

 /*
@@ -582,25 +598,52 @@ checkcondition_str(void *checkval, QueryOperand *val)
 	CHKVAL	   *chkval = (CHKVAL *) checkval;
 	WordEntry  *StopLow = chkval->arrb;
 	WordEntry  *StopHigh = chkval->arre;
-	WordEntry  *StopMiddle;
-	int			difference;
+	WordEntry  *StopMiddle = StopHigh;
+	int			difference = -1; 
+	bool		res=false;

 	/* Loop invariant: StopLow <= val < StopHigh */
-
 	while (StopLow < StopHigh)
 	{
 		StopMiddle = StopLow + (StopHigh - StopLow) / 2;
-		difference = ValCompare(chkval, StopMiddle, val);
+		difference = tsCompareString( chkval->operand + val->distance, val->length,
+									  chkval->values + StopMiddle->pos, StopMiddle->len,
+									  false);
+
 		if (difference == 0)
-			return (val->weight && StopMiddle->haspos) ?
+		{
+			res = (val->weight && StopMiddle->haspos) ?
 				checkclass_str(chkval, StopMiddle, val) : true;
-		else if (difference < 0)
+			break;
+		}
+		else if (difference > 0)
 			StopLow = StopMiddle + 1;
 		else
 			StopHigh = StopMiddle;
 	}

-	return (false);
+	if ( res == false && val->prefix == true )
+	{
+		/*
+		 * there was a failed exact search, so we should scan further to find
+		 * a prefix match.
+		 */
+		if ( StopLow >= StopHigh )
+			StopMiddle = StopHigh;
+
+		while( res == false && StopMiddle < chkval->arre && 
+				tsCompareString( chkval->operand + val->distance, val->length,
+								 chkval->values + StopMiddle->pos, StopMiddle->len,
+								 true) == 0 )
+		{
+			res = (val->weight && StopMiddle->haspos) ?
+				checkclass_str(chkval, StopMiddle, val) : true;
+
+			StopMiddle++;
+		}
+	}
+
+	return res; 
 }

 /*
@@ -758,50 +801,38 @@ check_weight(TSVector txt, WordEntry *wptr, int8 weight)
 	return num;
 }

-static WordEntry **
-SEI_realloc(WordEntry **in, uint32 *len)
-{
-	if (*len == 0 || in == NULL)
-	{
-		*len = 8;
-		in = palloc(sizeof(WordEntry *) * (*len));
-	}
-	else
-	{
-		*len *= 2;
-		in = repalloc(in, sizeof(WordEntry *) * (*len));
-	}
-	return in;
-}
+#define compareStatWord(a,e,s,t) \
+	tsCompareString(STATSTRPTR(s) + (a)->pos, (a)->len,	\
+					STRPTR(t) + (e)->pos, (e)->len,		\
+					false)

-static int
-compareStatWord(StatEntry *a, WordEntry *b, tsstat *stat, TSVector txt)
+typedef struct WordEntryMark
 {
-	if (a->len == b->len)
-		return strncmp(
-					   STATSTRPTR(stat) + a->pos,
-					   STRPTR(txt) + b->pos,
-					   a->len
-			);
-	return (a->len > b->len) ? 1 : -1;
-}
+	WordEntry	*newentry;
+	StatEntry	*pos;
+} WordEntryMark;

 static tsstat *
-formstat(tsstat *stat, TSVector txt, WordEntry **entry, uint32 len)
+formstat(tsstat *stat, TSVector txt, List *entries)
 {
-	tsstat	   *newstat;
-	uint32		totallen,
-				nentry;
-	uint32		slen = 0;
-	WordEntry **ptr = entry;
-	char	   *curptr;
-	StatEntry  *sptr,
-			   *nptr;
+	tsstat		   *newstat;
+	uint32			totallen,
+					nentry,
+					len = list_length(entries);
+	uint32			slen = 0;
+	WordEntry	   *ptr;
+	char		   *curptr;
+	StatEntry	   *sptr,
+				   *nptr;
+	ListCell  	   *entry;
+	StatEntry  	   *PosSE = STATPTR(stat),
+				   *prevPosSE;
+	WordEntryMark  *mark;

-	while (ptr - entry < len)
+	foreach( entry, entries )
 	{
-		slen += (*ptr)->len;
-		ptr++;
+		mark = (WordEntryMark*)lfirst(entry);
+		slen += mark->newentry->len;
 	}

 	nentry = stat->size + len;
@@ -815,78 +846,46 @@ formstat(tsstat *stat, TSVector txt, WordEntry **entry, uint32 len)
 	memcpy(STATSTRPTR(newstat), STATSTRPTR(stat), STATSTRSIZE(stat));
 	curptr = STATSTRPTR(newstat) + STATSTRSIZE(stat);

-	ptr = entry;
 	sptr = STATPTR(stat);
 	nptr = STATPTR(newstat);

-	if (len == 1)
+	foreach(entry, entries)
 	{
-		StatEntry  *StopLow = STATPTR(stat);
-		StatEntry  *StopHigh = (StatEntry *) STATSTRPTR(stat);
+		prevPosSE = PosSE;

-		while (StopLow < StopHigh)
+		mark = (WordEntryMark*)lfirst(entry);
+		ptr  = mark->newentry;
+		PosSE = mark->pos;
+
+		/*
+		 * Copy the old entries lying between the previous and current positions
+		 */
+		if ( PosSE > prevPosSE )
 		{
-			sptr = StopLow + (StopHigh - StopLow) / 2;
-			if (compareStatWord(sptr, *ptr, stat, txt) < 0)
-				StopLow = sptr + 1;
-			else
-				StopHigh = sptr;
+			memcpy( nptr, prevPosSE, sizeof(StatEntry) * (PosSE-prevPosSE) );
+			nptr += PosSE-prevPosSE;
 		}
-		nptr = STATPTR(newstat) + (StopLow - STATPTR(stat));
-		memcpy(STATPTR(newstat), STATPTR(stat), sizeof(StatEntry) * (StopLow - STATPTR(stat)));
-		if ((*ptr)->haspos)
-			nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
+
+		/*
+		 * Copy new entry
+		 */
+		if (ptr->haspos)
+			nptr->nentry = (stat->weight) ? check_weight(txt, ptr, stat->weight) : POSDATALEN(txt, ptr);
 		else
 			nptr->nentry = 1;
 		nptr->ndoc = 1;
-		nptr->len = (*ptr)->len;
-		memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len);
+		nptr->len = ptr->len;
+		memcpy(curptr, STRPTR(txt) + ptr->pos, nptr->len);
 		nptr->pos = curptr - STATSTRPTR(newstat);
-		memcpy(nptr + 1, StopLow, sizeof(StatEntry) * (((StatEntry *) STATSTRPTR(stat)) - StopLow));
-	}
-	else
-	{
-		while (sptr - STATPTR(stat) < stat->size && ptr - entry < len)
-		{
-			if (compareStatWord(sptr, *ptr, stat, txt) < 0)
-			{
-				memcpy(nptr, sptr, sizeof(StatEntry));
-				sptr++;
-			}
-			else
-			{
-				if ((*ptr)->haspos)
-					nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
-				else
-					nptr->nentry = 1;
-				nptr->ndoc = 1;
-				nptr->len = (*ptr)->len;
-				memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len);
-				nptr->pos = curptr - STATSTRPTR(newstat);
-				curptr += nptr->len;
-				ptr++;
-			}
-			nptr++;
-		}
+		curptr += nptr->len;
+		nptr++;

-		memcpy(nptr, sptr, sizeof(StatEntry) * (stat->size - (sptr - STATPTR(stat))));
-
-		while (ptr - entry < len)
-		{
-			if ((*ptr)->haspos)
-				nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
-			else
-				nptr->nentry = 1;
-			nptr->ndoc = 1;
-			nptr->len = (*ptr)->len;
-			memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len);
-			nptr->pos = curptr - STATSTRPTR(newstat);
-			curptr += nptr->len;
-			ptr++;
-			nptr++;
-		}
+		pfree(mark);
 	}

+	if ( PosSE < (StatEntry *) STATSTRPTR(stat) )
+		memcpy(nptr, PosSE, sizeof(StatEntry) * (stat->size - (PosSE - STATPTR(stat))));
+
 	return newstat;
 }

@@ -907,12 +906,11 @@ ts_accum(tsstat *stat, Datum data)
 {
 	tsstat	   *newstat;
 	TSVector	txt = DatumGetTSVector(data);
-	WordEntry **newentry = NULL;
-	uint32		len = 0,
-				cur = 0;
 	StatEntry  *sptr;
 	WordEntry  *wptr;
 	int			n = 0;
+	List	   *newentries=NIL;
+	StatEntry  *StopLow;

 	if (stat == NULL)
 	{							/* Init in first */
@@ -932,16 +930,23 @@ ts_accum(tsstat *stat, Datum data)

 	sptr = STATPTR(stat);
 	wptr = ARRPTR(txt);
+	StopLow = STATPTR(stat);

-	if (stat->size < 100 * txt->size)
-	{							/* merge */
-		while (sptr - STATPTR(stat) < stat->size && wptr - ARRPTR(txt) < txt->size)
+	while (wptr - ARRPTR(txt) < txt->size)
+	{
+		StatEntry  *StopHigh = (StatEntry *) STATSTRPTR(stat);
+		int			cmp;
+
+		/*
+		 * We do not reset StopLow to the beginning of the array: the tsvector is
+		 * ordered by the same rules, so we can search from the last stopped position
+		 */
+
+		while (StopLow < StopHigh)
 		{
-			int			cmp = compareStatWord(sptr, wptr, stat, txt);
-
-			if (cmp < 0)
-				sptr++;
-			else if (cmp == 0)
+			sptr = StopLow + (StopHigh - StopLow) / 2;
+			cmp = compareStatWord(sptr, wptr, stat, txt);
+			if (cmp == 0)
 			{
 				if (stat->weight == 0)
 				{
@@ -953,90 +958,38 @@ ts_accum(tsstat *stat, Datum data)
 					sptr->ndoc++;
 					sptr->nentry += n;
 				}
-				sptr++;
-				wptr++;
+				break;
 			}
+			else if (cmp < 0)
+				StopLow = sptr + 1;
 			else
-			{
-				if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0)
-				{
-					if (cur == len)
-						newentry = SEI_realloc(newentry, &len);
-					newentry[cur] = wptr;
-					cur++;
-				}
-				wptr++;
-			}
+				StopHigh = sptr;
 		}

-		while (wptr - ARRPTR(txt) < txt->size)
-		{
+		if (StopLow >= StopHigh)
+		{					/* not found */
 			if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0)
 			{
-				if (cur == len)
-					newentry = SEI_realloc(newentry, &len);
-				newentry[cur] = wptr;
-				cur++;
-			}
-			wptr++;
-		}
-	}
-	else
-	{							/* search */
-		while (wptr - ARRPTR(txt) < txt->size)
-		{
-			StatEntry  *StopLow = STATPTR(stat);
-			StatEntry  *StopHigh = (StatEntry *) STATSTRPTR(stat);
-			int			cmp;
+				WordEntryMark *mark = (WordEntryMark*)palloc(sizeof(WordEntryMark));

-			while (StopLow < StopHigh)
-			{
-				sptr = StopLow + (StopHigh - StopLow) / 2;
-				cmp = compareStatWord(sptr, wptr, stat, txt);
-				if (cmp == 0)
-				{
-					if (stat->weight == 0)
-					{
-						sptr->ndoc++;
-						sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1;
-					}
-					else if (wptr->haspos && (n = check_weight(txt, wptr, stat->weight)) != 0)
-					{
-						sptr->ndoc++;
-						sptr->nentry += n;
-					}
-					break;
-				}
-				else if (cmp < 0)
-					StopLow = sptr + 1;
-				else
-					StopHigh = sptr;
-			}
+				mark->newentry = wptr;
+				mark->pos = StopLow;
+				newentries = lappend( newentries, mark );

-			if (StopLow >= StopHigh)
-			{					/* not found */
-				if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0)
-				{
-					if (cur == len)
-						newentry = SEI_realloc(newentry, &len);
-					newentry[cur] = wptr;
-					cur++;
-				}
 			}
-			wptr++;
 		}
+		wptr++;
 	}

-
-	if (cur == 0)
+	if (list_length(newentries) == 0)
 	{							/* no new words */
 		if (txt != (TSVector) DatumGetPointer(data))
 			pfree(txt);
 		return stat;
 	}

-	newstat = formstat(stat, txt, newentry, cur);
-	pfree(newentry);
+	newstat = formstat(stat, txt, newentries);
+	list_free(newentries);

 	if (txt != (TSVector) DatumGetPointer(data))
 		pfree(txt);