postgres/src/backend/utils/adt/tsvector_parser.c

/*-------------------------------------------------------------------------
 *
 * tsvector_parser.c
 *	  Parser for tsvector
 *
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/tsvector_parser.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"


/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  The two flags are both true or
 * both false in current usage, but we keep them separate for clarity.
 * is_tsquery affects *only* the content of error messages.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat ! | * ( ) as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
};


/*
 * Initializes parser for the input string. If oprisdelim is set, the
 * following characters are treated as delimiters in addition to whitespace:
 * ! | & ( )
 */
TSVectorParseState
init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
{
	TSVectorParseState state;

	state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
	state->prsbuf = input;
	state->bufstart = input;
	state->len = 32;
	state->word = (char *) palloc(state->len);
	state->eml = pg_database_encoding_max_length();
	state->oprisdelim = oprisdelim;
	state->is_tsquery = is_tsquery;

	return state;
}

/*
 * Reinitializes parser to parse 'input', instead of previous input.
 */
void
reset_tsvector_parser(TSVectorParseState state, char *input)
{
	state->prsbuf = input;
}

/*
 * Shuts down a tsvector parser.
 */
void
close_tsvector_parser(TSVectorParseState state)
{
	pfree(state->word);
	pfree(state);
}

/* increase the size of 'word' if needed to hold one more character */
#define RESIZEPRSBUF \
do { \
	int clen = curpos - state->word; \
	if ( clen + state->eml >= state->len ) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + clen; \
	} \
} while (0)

#define ISOPERATOR(x)	( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )

/* Fills gettoken_tsvector's output parameters, and returns true */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)


/* State codes used in gettoken_tsvector */
#define WAITWORD		1
#define WAITENDWORD		2
#define WAITNEXTCHAR	3
#define WAITENDCMPLX	4
#define WAITPOSINFO		5
#define INPOSINFO		6
#define WAITPOSDELIM	7
#define WAITCHARCMPLX	8

#define PRSSYNTAXERROR prssyntaxerror(state)

static void
prssyntaxerror(TSVectorParseState state)
{
	ereport(ERROR,
			(errcode(ERRCODE_SYNTAX_ERROR),
			 state->is_tsquery ?
			 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
			 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
}


/*
 * Get next token from string being parsed. Returns true if successful,
 * false if end of input string is reached.  On success, these output
 * parameters are filled in:
 *
 * *strval		pointer to token
 * *lenval		length of *strval
 * *pos_ptr		pointer to a palloc'd array of positions and weights
 *				associated with the token. If the caller is not interested
 *				in the information, NULL can be supplied. Otherwise
 *				the caller is responsible for pfreeing the array.
 * *poslen		number of elements in *pos_ptr
 * *endptr		scan resumption point
 *
 * Pass NULL for unwanted output parameters.
 */
bool
gettoken_tsvector(TSVectorParseState state,
				  char **strval, int *lenval,
				  WordEntryPos **pos_ptr, int *poslen,
				  char **endptr)
{
	int			oldstate = 0;
	char	   *curpos = state->word;
	int			statecode = WAITWORD;

	/*
	 * pos is for collecting the comma delimited list of positions followed by
	 * the actual token.
	 */
	WordEntryPos *pos = NULL;
	int			npos = 0;		/* elements of pos used */
	int			posalen = 0;	/* allocated size of pos */

	while (1)
	{
		if (statecode == WAITWORD)
		{
			if (*(state->prsbuf) == '\0')
				return false;
			else if (t_iseq(state->prsbuf, '\''))
				statecode = WAITENDCMPLX;
			else if (t_iseq(state->prsbuf, '\\'))
			{
				statecode = WAITNEXTCHAR;
				oldstate = WAITENDWORD;
			}
			else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
				PRSSYNTAXERROR;
			else if (!t_isspace(state->prsbuf))
			{
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
				statecode = WAITENDWORD;
			}
		}
		else if (statecode == WAITNEXTCHAR)
		{
			if (*(state->prsbuf) == '\0')
				ereport(ERROR,
						(errcode(ERRCODE_SYNTAX_ERROR),
						 errmsg("there is no escaped character: \"%s\"",
								state->bufstart)));
			else
			{
				RESIZEPRSBUF;
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
				Assert(oldstate != 0);
				statecode = oldstate;
			}
		}
		else if (statecode == WAITENDWORD)
		{
			if (t_iseq(state->prsbuf, '\\'))
			{
				statecode = WAITNEXTCHAR;
				oldstate = WAITENDWORD;
			}
			else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
					 (state->oprisdelim && ISOPERATOR(state->prsbuf)))
			{
				RESIZEPRSBUF;
				if (curpos == state->word)
					PRSSYNTAXERROR;
				*(curpos) = '\0';
				RETURN_TOKEN;
			}
			else if (t_iseq(state->prsbuf, ':'))
			{
				if (curpos == state->word)
					PRSSYNTAXERROR;
				*(curpos) = '\0';
				if (state->oprisdelim)
					RETURN_TOKEN;
				else
					statecode = INPOSINFO;
			}
			else
			{
				RESIZEPRSBUF;
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
			}
		}
		else if (statecode == WAITENDCMPLX)
		{
			if (t_iseq(state->prsbuf, '\''))
			{
				statecode = WAITCHARCMPLX;
			}
			else if (t_iseq(state->prsbuf, '\\'))
			{
				statecode = WAITNEXTCHAR;
				oldstate = WAITENDCMPLX;
			}
			else if (*(state->prsbuf) == '\0')
				PRSSYNTAXERROR;
			else
			{
				RESIZEPRSBUF;
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
			}
		}
		else if (statecode == WAITCHARCMPLX)
		{
			if (t_iseq(state->prsbuf, '\''))
			{
				RESIZEPRSBUF;
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
				statecode = WAITENDCMPLX;
			}
			else
			{
				RESIZEPRSBUF;
				*(curpos) = '\0';
				if (curpos == state->word)
					PRSSYNTAXERROR;
				if (state->oprisdelim)
				{
					/* state->prsbuf+=pg_mblen(state->prsbuf); */
					RETURN_TOKEN;
				}
				else
					statecode = WAITPOSINFO;
				continue;		/* recheck current character */
			}
		}
		else if (statecode == WAITPOSINFO)
		{
			if (t_iseq(state->prsbuf, ':'))
				statecode = INPOSINFO;
			else
				RETURN_TOKEN;
		}
		else if (statecode == INPOSINFO)
		{
			if (t_isdigit(state->prsbuf))
			{
				if (posalen == 0)
				{
					posalen = 4;
					pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
					npos = 0;
				}
				else if (npos + 1 >= posalen)
				{
					posalen *= 2;
					pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
				}
				npos++;
				WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
				/* we cannot get here in tsquery, so no need for 2 errmsgs */
				if (WEP_GETPOS(pos[npos - 1]) == 0)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("wrong position info in tsvector: \"%s\"",
									state->bufstart)));
				WEP_SETWEIGHT(pos[npos - 1], 0);
				statecode = WAITPOSDELIM;
			}
			else
				PRSSYNTAXERROR;
		}
		else if (statecode == WAITPOSDELIM)
		{
			if (t_iseq(state->prsbuf, ','))
				statecode = INPOSINFO;
			else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
			{
				if (WEP_GETWEIGHT(pos[npos - 1]))
					PRSSYNTAXERROR;
				WEP_SETWEIGHT(pos[npos - 1], 3);
			}
			else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
			{
				if (WEP_GETWEIGHT(pos[npos - 1]))
					PRSSYNTAXERROR;
				WEP_SETWEIGHT(pos[npos - 1], 2);
			}
			else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
			{
				if (WEP_GETWEIGHT(pos[npos - 1]))
					PRSSYNTAXERROR;
				WEP_SETWEIGHT(pos[npos - 1], 1);
			}
			else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
			{
				if (WEP_GETWEIGHT(pos[npos - 1]))
					PRSSYNTAXERROR;
				WEP_SETWEIGHT(pos[npos - 1], 0);
			}
			else if (t_isspace(state->prsbuf) ||
					 *(state->prsbuf) == '\0')
				RETURN_TOKEN;
			else if (!t_isdigit(state->prsbuf))
				PRSSYNTAXERROR;
		}
		else	/* internal error */
			elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
				 statecode);

		/* get next char */
		state->prsbuf += pg_mblen(state->prsbuf);
	}
}