1
0
mirror of https://github.com/postgres/postgres.git synced 2025-04-20 00:42:27 +03:00
postgres/src/backend/utils/adt/tsvector_parser.c
Peter Eisentraut dd16f9480a Remove unreachable code
The Solaris Studio compiler warns about these instances, unlike more
mainstream compilers such as gcc.  But manual inspection showed that
the code is clearly not reachable, and we hope no worthy compiler will
complain about removing this code.
2012-07-16 22:15:03 +03:00

366 lines
8.9 KiB
C

/*-------------------------------------------------------------------------
*
* tsvector_parser.c
* Parser for tsvector
*
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/utils/adt/tsvector_parser.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
/*
* Private state of tsvector parser. Note that tsquery also uses this code to
* parse its input, hence the boolean flags. The two flags are both true or
* both false in current usage, but we keep them separate for clarity.
* is_tsquery affects *only* the content of error messages.
*/
struct TSVectorParseStateData
{
char *prsbuf; /* next input character */
char *bufstart; /* whole string (used only for errors) */
char *word; /* buffer to hold the current word */
int len; /* size in bytes allocated for 'word' */
int eml; /* max bytes per character */
bool oprisdelim; /* treat ! | * ( ) as delimiters? */
bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
};
/*
* Initializes parser for the input string. If oprisdelim is set, the
* following characters are treated as delimiters in addition to whitespace:
* ! | & ( )
*/
TSVectorParseState
init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
{
TSVectorParseState state;
state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
state->prsbuf = input;
state->bufstart = input;
state->len = 32;
state->word = (char *) palloc(state->len);
state->eml = pg_database_encoding_max_length();
state->oprisdelim = oprisdelim;
state->is_tsquery = is_tsquery;
return state;
}
/*
* Reinitializes parser to parse 'input', instead of previous input.
*/
void
reset_tsvector_parser(TSVectorParseState state, char *input)
{
state->prsbuf = input;
}
/*
* Shuts down a tsvector parser.
*/
void
close_tsvector_parser(TSVectorParseState state)
{
pfree(state->word);
pfree(state);
}
/* increase the size of 'word' if needed to hold one more character */
#define RESIZEPRSBUF \
do { \
int clen = curpos - state->word; \
if ( clen + state->eml >= state->len ) \
{ \
state->len *= 2; \
state->word = (char *) repalloc(state->word, state->len); \
curpos = state->word + clen; \
} \
} while (0)
#define ISOPERATOR(x) ( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )
/* Fills gettoken_tsvector's output parameters, and returns true */
#define RETURN_TOKEN \
do { \
if (pos_ptr != NULL) \
{ \
*pos_ptr = pos; \
*poslen = npos; \
} \
else if (pos != NULL) \
pfree(pos); \
\
if (strval != NULL) \
*strval = state->word; \
if (lenval != NULL) \
*lenval = curpos - state->word; \
if (endptr != NULL) \
*endptr = state->prsbuf; \
return true; \
} while(0)
/* State codes used in gettoken_tsvector */
#define WAITWORD 1
#define WAITENDWORD 2
#define WAITNEXTCHAR 3
#define WAITENDCMPLX 4
#define WAITPOSINFO 5
#define INPOSINFO 6
#define WAITPOSDELIM 7
#define WAITCHARCMPLX 8
#define PRSSYNTAXERROR prssyntaxerror(state)
static void
prssyntaxerror(TSVectorParseState state)
{
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
state->is_tsquery ?
errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
}
/*
* Get next token from string being parsed. Returns true if successful,
* false if end of input string is reached. On success, these output
* parameters are filled in:
*
* *strval pointer to token
* *lenval length of *strval
* *pos_ptr pointer to a palloc'd array of positions and weights
* associated with the token. If the caller is not interested
* in the information, NULL can be supplied. Otherwise
* the caller is responsible for pfreeing the array.
* *poslen number of elements in *pos_ptr
* *endptr scan resumption point
*
* Pass NULL for unwanted output parameters.
*/
bool
gettoken_tsvector(TSVectorParseState state,
char **strval, int *lenval,
WordEntryPos **pos_ptr, int *poslen,
char **endptr)
{
int oldstate = 0;
char *curpos = state->word;
int statecode = WAITWORD;
/*
* pos is for collecting the comma delimited list of positions followed by
* the actual token.
*/
WordEntryPos *pos = NULL;
int npos = 0; /* elements of pos used */
int posalen = 0; /* allocated size of pos */
while (1)
{
if (statecode == WAITWORD)
{
if (*(state->prsbuf) == '\0')
return false;
else if (t_iseq(state->prsbuf, '\''))
statecode = WAITENDCMPLX;
else if (t_iseq(state->prsbuf, '\\'))
{
statecode = WAITNEXTCHAR;
oldstate = WAITENDWORD;
}
else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
PRSSYNTAXERROR;
else if (!t_isspace(state->prsbuf))
{
COPYCHAR(curpos, state->prsbuf);
curpos += pg_mblen(state->prsbuf);
statecode = WAITENDWORD;
}
}
else if (statecode == WAITNEXTCHAR)
{
if (*(state->prsbuf) == '\0')
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("there is no escaped character: \"%s\"",
state->bufstart)));
else
{
RESIZEPRSBUF;
COPYCHAR(curpos, state->prsbuf);
curpos += pg_mblen(state->prsbuf);
Assert(oldstate != 0);
statecode = oldstate;
}
}
else if (statecode == WAITENDWORD)
{
if (t_iseq(state->prsbuf, '\\'))
{
statecode = WAITNEXTCHAR;
oldstate = WAITENDWORD;
}
else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
(state->oprisdelim && ISOPERATOR(state->prsbuf)))
{
RESIZEPRSBUF;
if (curpos == state->word)
PRSSYNTAXERROR;
*(curpos) = '\0';
RETURN_TOKEN;
}
else if (t_iseq(state->prsbuf, ':'))
{
if (curpos == state->word)
PRSSYNTAXERROR;
*(curpos) = '\0';
if (state->oprisdelim)
RETURN_TOKEN;
else
statecode = INPOSINFO;
}
else
{
RESIZEPRSBUF;
COPYCHAR(curpos, state->prsbuf);
curpos += pg_mblen(state->prsbuf);
}
}
else if (statecode == WAITENDCMPLX)
{
if (t_iseq(state->prsbuf, '\''))
{
statecode = WAITCHARCMPLX;
}
else if (t_iseq(state->prsbuf, '\\'))
{
statecode = WAITNEXTCHAR;
oldstate = WAITENDCMPLX;
}
else if (*(state->prsbuf) == '\0')
PRSSYNTAXERROR;
else
{
RESIZEPRSBUF;
COPYCHAR(curpos, state->prsbuf);
curpos += pg_mblen(state->prsbuf);
}
}
else if (statecode == WAITCHARCMPLX)
{
if (t_iseq(state->prsbuf, '\''))
{
RESIZEPRSBUF;
COPYCHAR(curpos, state->prsbuf);
curpos += pg_mblen(state->prsbuf);
statecode = WAITENDCMPLX;
}
else
{
RESIZEPRSBUF;
*(curpos) = '\0';
if (curpos == state->word)
PRSSYNTAXERROR;
if (state->oprisdelim)
{
/* state->prsbuf+=pg_mblen(state->prsbuf); */
RETURN_TOKEN;
}
else
statecode = WAITPOSINFO;
continue; /* recheck current character */
}
}
else if (statecode == WAITPOSINFO)
{
if (t_iseq(state->prsbuf, ':'))
statecode = INPOSINFO;
else
RETURN_TOKEN;
}
else if (statecode == INPOSINFO)
{
if (t_isdigit(state->prsbuf))
{
if (posalen == 0)
{
posalen = 4;
pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
npos = 0;
}
else if (npos + 1 >= posalen)
{
posalen *= 2;
pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
}
npos++;
WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
/* we cannot get here in tsquery, so no need for 2 errmsgs */
if (WEP_GETPOS(pos[npos - 1]) == 0)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("wrong position info in tsvector: \"%s\"",
state->bufstart)));
WEP_SETWEIGHT(pos[npos - 1], 0);
statecode = WAITPOSDELIM;
}
else
PRSSYNTAXERROR;
}
else if (statecode == WAITPOSDELIM)
{
if (t_iseq(state->prsbuf, ','))
statecode = INPOSINFO;
else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
{
if (WEP_GETWEIGHT(pos[npos - 1]))
PRSSYNTAXERROR;
WEP_SETWEIGHT(pos[npos - 1], 3);
}
else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
{
if (WEP_GETWEIGHT(pos[npos - 1]))
PRSSYNTAXERROR;
WEP_SETWEIGHT(pos[npos - 1], 2);
}
else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
{
if (WEP_GETWEIGHT(pos[npos - 1]))
PRSSYNTAXERROR;
WEP_SETWEIGHT(pos[npos - 1], 1);
}
else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
{
if (WEP_GETWEIGHT(pos[npos - 1]))
PRSSYNTAXERROR;
WEP_SETWEIGHT(pos[npos - 1], 0);
}
else if (t_isspace(state->prsbuf) ||
*(state->prsbuf) == '\0')
RETURN_TOKEN;
else if (!t_isdigit(state->prsbuf))
PRSSYNTAXERROR;
}
else /* internal error */
elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
statecode);
/* get next char */
state->prsbuf += pg_mblen(state->prsbuf);
}
}