Fix shared tsvector/tsquery input code so that we don't say "syntax error in tsvector" when we are really parsing a tsquery. Report the bogus input, too. Make styles of some related error messages more consistent.
2025-10-18 04:29:09 +03:00 · 2007-10-21 22:29:56 +00:00
parent dfc6f130b4
commit 1ea47dd8cb
4 changed files with 81 additions and 68 deletions
--- a/src/backend/utils/adt/tsvector_parser.c
+++ b/src/backend/utils/adt/tsvector_parser.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.1 2007/09/07 15:09:56 teodor Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.2 2007/10/21 22:29:56 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -20,35 +20,49 @@
 #include "tsearch/ts_utils.h"
 #include "utils/memutils.h"

+
+/*
+ * Private state of tsvector parser.  Note that tsquery also uses this code to
+ * parse its input, hence the boolean flags.  The two flags are both true or
+ * both false in current usage, but we keep them separate for clarity.
+ * is_tsquery affects *only* the content of error messages.
+ */
 struct TSVectorParseStateData
 {
-	char   *prsbuf;
-	char   *word;		/* buffer to hold the current word */
-	int		len;		/* size in bytes allocated for 'word' */
-	bool	oprisdelim;
+	char   *prsbuf;				/* next input character */
+	char   *bufstart;			/* whole string (used only for errors) */
+	char   *word;				/* buffer to hold the current word */
+	int		len;				/* size in bytes allocated for 'word' */
+	int		eml;				/* max bytes per character */
+	bool	oprisdelim;			/* treat ! | & ( ) as delimiters? */
+	bool	is_tsquery;			/* say "tsquery" not "tsvector" in errors? */
 };

+
 /*
 * Initializes parser for the input string. If oprisdelim is set, the
 * following characters are treated as delimiters in addition to whitespace:
 * ! | & ( )
 */
 TSVectorParseState
-init_tsvector_parser(char *input, bool oprisdelim)
+init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
 {
 	TSVectorParseState state;

 	state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
 	state->prsbuf = input;
+	state->bufstart = input;
 	state->len = 32;
 	state->word = (char *) palloc(state->len);
+	state->eml = pg_database_encoding_max_length();
 	state->oprisdelim = oprisdelim;
+	state->is_tsquery = is_tsquery;

 	return state;
 }

 /*
- * Reinitializes parser for parsing 'input', instead of previous input.
+ * Reinitializes parser to parse 'input', instead of previous input.
 */
 void
 reset_tsvector_parser(TSVectorParseState state, char *input)
@@ -66,21 +80,21 @@ close_tsvector_parser(TSVectorParseState state)
 	pfree(state);
 }

+/* increase the size of 'word' if needed to hold one more character */
 #define RESIZEPRSBUF \
 do { \
-	if ( curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
+	int clen = curpos - state->word; \
+	if ( clen + state->eml >= state->len ) \
 	{ \
-		int clen = curpos - state->word; \
 		state->len *= 2; \
-		state->word = (char*)repalloc( (void*)state->word, state->len ); \
+		state->word = (char *) repalloc(state->word, state->len); \
 		curpos = state->word + clen; \
 	} \
 } while (0)

-
 #define ISOPERATOR(x)	( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )

-/* Fills the output parameters, and returns true */
+/* Fills gettoken_tsvector's output parameters, and returns true */
 #define RETURN_TOKEN \
 do { \
 	if (pos_ptr != NULL) \
@@ -111,18 +125,34 @@ do { \
 #define WAITPOSDELIM	7
 #define WAITCHARCMPLX	8

+#define PRSSYNTAXERROR prssyntaxerror(state)
+
+static void
+prssyntaxerror(TSVectorParseState state)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 state->is_tsquery ?
+			 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
+			 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
+}
+
+
 /*
- * Get next token from string being parsed. Returns false if
- * end of input string is reached, otherwise strval, lenval, pos_ptr
- * and poslen output parameters are filled in:
+ * Get next token from string being parsed. Returns true if successful,
+ * false if end of input string is reached.  On success, these output
+ * parameters are filled in:
 * 
- * *strval 		token
- * *lenval 		length of*strval
+ * *strval 		pointer to token
+ * *lenval 		length of *strval
 * *pos_ptr		pointer to a palloc'd array of positions and weights
 * 				associated with the token. If the caller is not interested
 *				in the information, NULL can be supplied. Otherwise
 *				the caller is responsible for pfreeing the array.
 * *poslen		number of elements in *pos_ptr
+ * *endptr		scan resumption point
+ *
+ * Pass NULL for unwanted output parameters.
 */
 bool
 gettoken_tsvector(TSVectorParseState state, 
@@ -155,9 +185,7 @@ gettoken_tsvector(TSVectorParseState state,
 				oldstate = WAITENDWORD;
 			}
 			else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
-				ereport(ERROR,
-						(errcode(ERRCODE_SYNTAX_ERROR),
-						 errmsg("syntax error in tsvector")));
+				PRSSYNTAXERROR;
 			else if (!t_isspace(state->prsbuf))
 			{
 				COPYCHAR(curpos, state->prsbuf);
@@ -170,7 +198,8 @@ gettoken_tsvector(TSVectorParseState state,
 			if (*(state->prsbuf) == '\0')
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
-						 errmsg("there is no escaped character")));
+						 errmsg("there is no escaped character: \"%s\"",
+								state->bufstart)));
 			else
 			{
 				RESIZEPRSBUF;
@@ -192,18 +221,14 @@ gettoken_tsvector(TSVectorParseState state,
 			{
 				RESIZEPRSBUF;
 				if (curpos == state->word)
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
+					PRSSYNTAXERROR;
 				*(curpos) = '\0';
 				RETURN_TOKEN;
 			}
 			else if (t_iseq(state->prsbuf, ':'))
 			{
 				if (curpos == state->word)
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
+					PRSSYNTAXERROR;
 				*(curpos) = '\0';
 				if (state->oprisdelim)
 					RETURN_TOKEN;
@@ -229,9 +254,7 @@ gettoken_tsvector(TSVectorParseState state,
 				oldstate = WAITENDCMPLX;
 			}
 			else if (*(state->prsbuf) == '\0')
-				ereport(ERROR,
-						(errcode(ERRCODE_SYNTAX_ERROR),
-						 errmsg("syntax error in tsvector")));
+				PRSSYNTAXERROR;
 			else
 			{
 				RESIZEPRSBUF;
@@ -253,9 +276,7 @@ gettoken_tsvector(TSVectorParseState state,
 				RESIZEPRSBUF;
 				*(curpos) = '\0';
 				if (curpos == state->word)
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
+					PRSSYNTAXERROR;
 				if (state->oprisdelim)
 				{
 					/* state->prsbuf+=pg_mblen(state->prsbuf); */
@@ -290,17 +311,17 @@ gettoken_tsvector(TSVectorParseState state,
 				}
 				npos++;
 				WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
+				/* we cannot get here in tsquery, so no need for 2 errmsgs */
 				if (WEP_GETPOS(pos[npos - 1]) == 0)
 					ereport(ERROR,
 							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("wrong position info in tsvector")));
+							 errmsg("wrong position info in tsvector: \"%s\"",
+									state->bufstart)));
 				WEP_SETWEIGHT(pos[npos - 1], 0);
 				statecode = WAITPOSDELIM;
 			}
 			else
-				ereport(ERROR,
-						(errcode(ERRCODE_SYNTAX_ERROR),
-						 errmsg("syntax error in tsvector")));
+				PRSSYNTAXERROR;
 		}
 		else if (statecode == WAITPOSDELIM)
 		{
@@ -309,42 +330,32 @@ gettoken_tsvector(TSVectorParseState state,
 			else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
 			{
 				if (WEP_GETWEIGHT(pos[npos - 1]))
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
+					PRSSYNTAXERROR;
 				WEP_SETWEIGHT(pos[npos - 1], 3);
 			}
 			else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
 			{
 				if (WEP_GETWEIGHT(pos[npos - 1]))
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
+					PRSSYNTAXERROR;
 				WEP_SETWEIGHT(pos[npos - 1], 2);
 			}
 			else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
 			{
 				if (WEP_GETWEIGHT(pos[npos - 1]))
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
+					PRSSYNTAXERROR;
 				WEP_SETWEIGHT(pos[npos - 1], 1);
 			}
 			else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
 			{
 				if (WEP_GETWEIGHT(pos[npos - 1]))
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
+					PRSSYNTAXERROR;
 				WEP_SETWEIGHT(pos[npos - 1], 0);
 			}
 			else if (t_isspace(state->prsbuf) ||
 					 *(state->prsbuf) == '\0')
 				RETURN_TOKEN;
 			else if (!t_isdigit(state->prsbuf))
-				ereport(ERROR,
-						(errcode(ERRCODE_SYNTAX_ERROR),
-						 errmsg("syntax error in tsvector")));
+				PRSSYNTAXERROR;
 		}
 		else					/* internal error */
 			elog(ERROR, "internal error in gettoken_tsvector");