1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-30 21:42:05 +03:00

Tsearch2 functionality migrates to core. The bulk of this work is by

Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing,
so anything that's broken is probably my fault.

Documentation is nonexistent as yet, but let's land the patch so we can
get some portability testing done.
This commit is contained in:
Tom Lane
2007-08-21 01:11:32 +00:00
parent 4e94d1f952
commit 140d4ebcb4
200 changed files with 54388 additions and 147 deletions

View File

@ -0,0 +1,683 @@
/*-------------------------------------------------------------------------
*
* tsvector.c
* I/O functions for tsvector
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.1 2007/08/21 01:11:19 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "libpq/pqformat.h"
#include "tsearch/ts_type.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
#include "utils/memutils.h"
static int
comparePos(const void *a, const void *b)
{
if (WEP_GETPOS(*(WordEntryPos *) a) == WEP_GETPOS(*(WordEntryPos *) b))
return 0;
return (WEP_GETPOS(*(WordEntryPos *) a) > WEP_GETPOS(*(WordEntryPos *) b)) ? 1 : -1;
}
static int
uniquePos(WordEntryPos * a, int4 l)
{
WordEntryPos *ptr,
*res;
if (l == 1)
return l;
res = a;
qsort((void *) a, l, sizeof(WordEntryPos), comparePos);
ptr = a + 1;
while (ptr - a < l)
{
if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
{
res++;
*res = *ptr;
if (res - a >= MAXNUMPOS - 1 || WEP_GETPOS(*res) == MAXENTRYPOS - 1)
break;
}
else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
ptr++;
}
return res + 1 - a;
}
static int
compareentry(const void *a, const void *b, void *arg)
{
char *BufferStr = (char *) arg;
if (((WordEntryIN *) a)->entry.len == ((WordEntryIN *) b)->entry.len)
{
return strncmp(&BufferStr[((WordEntryIN *) a)->entry.pos],
&BufferStr[((WordEntryIN *) b)->entry.pos],
((WordEntryIN *) a)->entry.len);
}
return (((WordEntryIN *) a)->entry.len > ((WordEntryIN *) b)->entry.len) ? 1 : -1;
}
static int
uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
{
WordEntryIN *ptr,
*res;
res = a;
if (l == 1)
{
if (a->entry.haspos)
{
*(uint16 *) (a->pos) = uniquePos(&(a->pos[1]), *(uint16 *) (a->pos));
*outbuflen = SHORTALIGN(res->entry.len) + (*(uint16 *) (a->pos) + 1) * sizeof(WordEntryPos);
}
return l;
}
ptr = a + 1;
qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf);
while (ptr - a < l)
{
if (!(ptr->entry.len == res->entry.len &&
strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0))
{
if (res->entry.haspos)
{
*(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos));
*outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos);
}
*outbuflen += SHORTALIGN(res->entry.len);
res++;
memcpy(res, ptr, sizeof(WordEntryIN));
}
else if (ptr->entry.haspos)
{
if (res->entry.haspos)
{
int4 len = *(uint16 *) (ptr->pos) + 1 + *(uint16 *) (res->pos);
res->pos = (WordEntryPos *) repalloc(res->pos, len * sizeof(WordEntryPos));
memcpy(&(res->pos[*(uint16 *) (res->pos) + 1]),
&(ptr->pos[1]), *(uint16 *) (ptr->pos) * sizeof(WordEntryPos));
*(uint16 *) (res->pos) += *(uint16 *) (ptr->pos);
pfree(ptr->pos);
}
else
{
res->entry.haspos = 1;
res->pos = ptr->pos;
}
}
ptr++;
}
if (res->entry.haspos)
{
*(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos));
*outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos);
}
*outbuflen += SHORTALIGN(res->entry.len);
return res + 1 - a;
}
static int
WordEntryCMP(WordEntry * a, WordEntry * b, char *buf)
{
return compareentry(a, b, buf);
}
#define WAITWORD 1
#define WAITENDWORD 2
#define WAITNEXTCHAR 3
#define WAITENDCMPLX 4
#define WAITPOSINFO 5
#define INPOSINFO 6
#define WAITPOSDELIM 7
#define WAITCHARCMPLX 8
#define RESIZEPRSBUF \
do { \
if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
{ \
int4 clen = state->curpos - state->word; \
state->len *= 2; \
state->word = (char*)repalloc( (void*)state->word, state->len ); \
state->curpos = state->word + clen; \
} \
} while (0)
bool
gettoken_tsvector(TSVectorParseState *state)
{
int4 oldstate = 0;
state->curpos = state->word;
state->state = WAITWORD;
state->alen = 0;
while (1)
{
if (state->state == WAITWORD)
{
if (*(state->prsbuf) == '\0')
return false;
else if (t_iseq(state->prsbuf, '\''))
state->state = WAITENDCMPLX;
else if (t_iseq(state->prsbuf, '\\'))
{
state->state = WAITNEXTCHAR;
oldstate = WAITENDWORD;
}
else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in tsvector")));
else if (!t_isspace(state->prsbuf))
{
COPYCHAR(state->curpos, state->prsbuf);
state->curpos += pg_mblen(state->prsbuf);
state->state = WAITENDWORD;
}
}
else if (state->state == WAITNEXTCHAR)
{
if (*(state->prsbuf) == '\0')
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("there is no escaped character")));
else
{
RESIZEPRSBUF;
COPYCHAR(state->curpos, state->prsbuf);
state->curpos += pg_mblen(state->prsbuf);
state->state = oldstate;
}
}
else if (state->state == WAITENDWORD)
{
if (t_iseq(state->prsbuf, '\\'))
{
state->state = WAITNEXTCHAR;
oldstate = WAITENDWORD;
}
else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
(state->oprisdelim && ISOPERATOR(state->prsbuf)))
{
RESIZEPRSBUF;
if (state->curpos == state->word)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in tsvector")));
*(state->curpos) = '\0';
return true;
}
else if (t_iseq(state->prsbuf, ':'))
{
if (state->curpos == state->word)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in tsvector")));
*(state->curpos) = '\0';
if (state->oprisdelim)
return true;
else
state->state = INPOSINFO;
}
else
{
RESIZEPRSBUF;
COPYCHAR(state->curpos, state->prsbuf);
state->curpos += pg_mblen(state->prsbuf);
}
}
else if (state->state == WAITENDCMPLX)
{
if (t_iseq(state->prsbuf, '\''))
{
state->state = WAITCHARCMPLX;
}
else if (t_iseq(state->prsbuf, '\\'))
{
state->state = WAITNEXTCHAR;
oldstate = WAITENDCMPLX;
}
else if (*(state->prsbuf) == '\0')
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in tsvector")));
else
{
RESIZEPRSBUF;
COPYCHAR(state->curpos, state->prsbuf);
state->curpos += pg_mblen(state->prsbuf);
}
}
else if (state->state == WAITCHARCMPLX)
{
if (t_iseq(state->prsbuf, '\''))
{
RESIZEPRSBUF;
COPYCHAR(state->curpos, state->prsbuf);
state->curpos += pg_mblen(state->prsbuf);
state->state = WAITENDCMPLX;
}
else
{
RESIZEPRSBUF;
*(state->curpos) = '\0';
if (state->curpos == state->word)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in tsvector")));
if (state->oprisdelim)
{
/* state->prsbuf+=pg_mblen(state->prsbuf); */
return true;
}
else
state->state = WAITPOSINFO;
continue; /* recheck current character */
}
}
else if (state->state == WAITPOSINFO)
{
if (t_iseq(state->prsbuf, ':'))
state->state = INPOSINFO;
else
return true;
}
else if (state->state == INPOSINFO)
{
if (t_isdigit(state->prsbuf))
{
if (state->alen == 0)
{
state->alen = 4;
state->pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * state->alen);
*(uint16 *) (state->pos) = 0;
}
else if (*(uint16 *) (state->pos) + 1 >= state->alen)
{
state->alen *= 2;
state->pos = (WordEntryPos *) repalloc(state->pos, sizeof(WordEntryPos) * state->alen);
}
(*(uint16 *) (state->pos))++;
WEP_SETPOS(state->pos[*(uint16 *) (state->pos)], LIMITPOS(atoi(state->prsbuf)));
if (WEP_GETPOS(state->pos[*(uint16 *) (state->pos)]) == 0)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("wrong position info in tsvector")));
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0);
state->state = WAITPOSDELIM;
}
else
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in tsvector")));
}
else if (state->state == WAITPOSDELIM)
{
if (t_iseq(state->prsbuf, ','))
state->state = INPOSINFO;
else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
{
if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in tsvector")));
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3);
}
else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
{
if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in tsvector")));
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2);
}
else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
{
if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in tsvector")));
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1);
}
else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
{
if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in tsvector")));
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0);
}
else if (t_isspace(state->prsbuf) ||
*(state->prsbuf) == '\0')
return true;
else if (!t_isdigit(state->prsbuf))
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in tsvector")));
}
else /* internal error */
elog(ERROR, "internal error in gettoken_tsvector");
/* get next char */
state->prsbuf += pg_mblen(state->prsbuf);
}
return false;
}
Datum
tsvectorin(PG_FUNCTION_ARGS)
{
char *buf = PG_GETARG_CSTRING(0);
TSVectorParseState state;
WordEntryIN *arr;
WordEntry *inarr;
int4 len = 0,
totallen = 64;
TSVector in;
char *tmpbuf,
*cur;
int4 i,
buflen = 256;
pg_verifymbstr(buf, strlen(buf), false);
state.prsbuf = buf;
state.len = 32;
state.word = (char *) palloc(state.len);
state.oprisdelim = false;
arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * totallen);
cur = tmpbuf = (char *) palloc(buflen);
while (gettoken_tsvector(&state))
{
/*
* Realloc buffers if it's needed
*/
if (len >= totallen)
{
totallen *= 2;
arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * totallen);
}
while ((cur - tmpbuf) + (state.curpos - state.word) >= buflen)
{
int4 dist = cur - tmpbuf;
buflen *= 2;
tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
cur = tmpbuf + dist;
}
if (state.curpos - state.word >= MAXSTRLEN)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("word is too long (%d bytes, max %d bytes)",
state.curpos - state.word, MAXSTRLEN)));
arr[len].entry.len = state.curpos - state.word;
if (cur - tmpbuf > MAXSTRPOS)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("position value too large")));
arr[len].entry.pos = cur - tmpbuf;
memcpy((void *) cur, (void *) state.word, arr[len].entry.len);
cur += arr[len].entry.len;
if (state.alen)
{
arr[len].entry.haspos = 1;
arr[len].pos = state.pos;
}
else
arr[len].entry.haspos = 0;
len++;
}
pfree(state.word);
if (len > 0)
len = uniqueentry(arr, len, tmpbuf, &buflen);
else
buflen = 0;
totallen = CALCDATASIZE(len, buflen);
in = (TSVector) palloc0(totallen);
SET_VARSIZE(in, totallen);
in->size = len;
cur = STRPTR(in);
inarr = ARRPTR(in);
for (i = 0; i < len; i++)
{
memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
arr[i].entry.pos = cur - STRPTR(in);
cur += SHORTALIGN(arr[i].entry.len);
if (arr[i].entry.haspos)
{
memcpy(cur, arr[i].pos, (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos));
cur += (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos);
pfree(arr[i].pos);
}
inarr[i] = arr[i].entry;
}
PG_RETURN_TSVECTOR(in);
}
Datum
tsvectorout(PG_FUNCTION_ARGS)
{
TSVector out = PG_GETARG_TSVECTOR(0);
char *outbuf;
int4 i,
lenbuf = 0,
pp;
WordEntry *ptr = ARRPTR(out);
char *curbegin,
*curin,
*curout;
lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
for (i = 0; i < out->size; i++)
{
lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
if (ptr[i].haspos)
lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
}
curout = outbuf = (char *) palloc(lenbuf);
for (i = 0; i < out->size; i++)
{
curbegin = curin = STRPTR(out) + ptr->pos;
if (i != 0)
*curout++ = ' ';
*curout++ = '\'';
while (curin - curbegin < ptr->len)
{
int len = pg_mblen(curin);
if (t_iseq(curin, '\''))
*curout++ = '\'';
while (len--)
*curout++ = *curin++;
}
*curout++ = '\'';
if ((pp = POSDATALEN(out, ptr)) != 0)
{
WordEntryPos *wptr;
*curout++ = ':';
wptr = POSDATAPTR(out, ptr);
while (pp)
{
curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
switch (WEP_GETWEIGHT(*wptr))
{
case 3:
*curout++ = 'A';
break;
case 2:
*curout++ = 'B';
break;
case 1:
*curout++ = 'C';
break;
case 0:
default:
break;
}
if (pp > 1)
*curout++ = ',';
pp--;
wptr++;
}
}
ptr++;
}
*curout = '\0';
PG_FREE_IF_COPY(out, 0);
PG_RETURN_CSTRING(outbuf);
}
Datum
tsvectorsend(PG_FUNCTION_ARGS)
{
TSVector vec = PG_GETARG_TSVECTOR(0);
StringInfoData buf;
int i,
j;
WordEntry *weptr = ARRPTR(vec);
pq_begintypsend(&buf);
pq_sendint(&buf, vec->size, sizeof(int32));
for (i = 0; i < vec->size; i++)
{
/*
* We are sure that sizeof(WordEntry) == sizeof(int32)
*/
pq_sendint(&buf, *(int32 *) weptr, sizeof(int32));
pq_sendbytes(&buf, STRPTR(vec) + weptr->pos, weptr->len);
if (weptr->haspos)
{
WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
pq_sendint(&buf, POSDATALEN(vec, weptr), sizeof(WordEntryPos));
for (j = 0; j < POSDATALEN(vec, weptr); j++)
pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
}
weptr++;
}
PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}
Datum
tsvectorrecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
TSVector vec;
int i,
size,
len = DATAHDRSIZE;
WordEntry *weptr;
int datalen = 0;
size = pq_getmsgint(buf, sizeof(uint32));
if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry)))
elog(ERROR, "invalid size of tsvector");
len += sizeof(WordEntry) * size;
len *= 2;
vec = (TSVector) palloc0(len);
vec->size = size;
weptr = ARRPTR(vec);
for (i = 0; i < size; i++)
{
int tmp;
weptr = ARRPTR(vec) + i;
/*
* We are sure that sizeof(WordEntry) == sizeof(int32)
*/
tmp = pq_getmsgint(buf, sizeof(int32));
*weptr = *(WordEntry *) & tmp;
while (CALCDATASIZE(size, datalen + SHORTALIGN(weptr->len)) >= len)
{
len *= 2;
vec = (TSVector) repalloc(vec, len);
weptr = ARRPTR(vec) + i;
}
memcpy(STRPTR(vec) + weptr->pos,
pq_getmsgbytes(buf, weptr->len),
weptr->len);
datalen += SHORTALIGN(weptr->len);
if (i > 0 && WordEntryCMP(weptr, weptr - 1, STRPTR(vec)) <= 0)
elog(ERROR, "lexemes are unordered");
if (weptr->haspos)
{
uint16 j,
npos;
WordEntryPos *wepptr;
npos = (uint16) pq_getmsgint(buf, sizeof(int16));
if (npos > MAXNUMPOS)
elog(ERROR, "unexpected number of positions");
while (CALCDATASIZE(size, datalen + (npos + 1) * sizeof(WordEntryPos)) >= len)
{
len *= 2;
vec = (TSVector) repalloc(vec, len);
weptr = ARRPTR(vec) + i;
}
memcpy(_POSDATAPTR(vec, weptr), &npos, sizeof(int16));
wepptr = POSDATAPTR(vec, weptr);
for (j = 0; j < npos; j++)
{
wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(int16));
if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
elog(ERROR, "position information is unordered");
}
datalen += (npos + 1) * sizeof(WordEntry);
}
}
SET_VARSIZE(vec, CALCDATASIZE(vec->size, datalen));
PG_RETURN_TSVECTOR(vec);
}