mirror of
https://github.com/postgres/postgres.git
synced 2025-05-08 07:21:33 +03:00
After receiving position data for a lexeme, tsvectorrecv() advanced its "datalen" value by (npos+1)*sizeof(WordEntry) where the correct calculation is (npos+1)*sizeof(WordEntryPos). This accidentally failed to render the constructed tsvector invalid, but it did result in leaving some wasted space approximately equal to the space consumed by the position data. That could have several bad effects: * Disk space is wasted if the received tsvector is stored into a table as-is. * A legal tsvector could get rejected with "maximum total lexeme length exceeded" if the extra space pushes it over the MAXSTRPOS limit. * In edge cases, the finished tsvector could be assigned a length larger than the allocated size of its palloc chunk, conceivably leading to SIGSEGV when the tsvector gets copied somewhere else. The odds of a field failure of this sort seem low, though valgrind testing could probably have found this. While we're here, let's express the calculation as "sizeof(uint16) + npos * sizeof(WordEntryPos)" to avoid the type pun implicit in the "npos + 1" formulation. It's not wrong given that WordEntryPos had better be 2 bytes to avoid padding problems, but it seems clearer this way. Report and patch by Denis Erokhin. Back-patch to all supported versions. Discussion: https://postgr.es/m/009801d9f2d9$f29730c0$d7c59240$@datagile.ru
552 lines
12 KiB
C
552 lines
12 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* tsvector.c
|
|
* I/O functions for tsvector
|
|
*
|
|
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
|
|
*
|
|
*
|
|
* IDENTIFICATION
|
|
* src/backend/utils/adt/tsvector.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "libpq/pqformat.h"
|
|
#include "tsearch/ts_locale.h"
|
|
#include "tsearch/ts_utils.h"
|
|
#include "utils/builtins.h"
|
|
#include "utils/memutils.h"
|
|
|
|
typedef struct
|
|
{
|
|
WordEntry entry; /* must be first! */
|
|
WordEntryPos *pos;
|
|
int poslen; /* number of elements in pos */
|
|
} WordEntryIN;
|
|
|
|
|
|
/* Compare two WordEntryPos values for qsort */
|
|
int
|
|
compareWordEntryPos(const void *a, const void *b)
|
|
{
|
|
int apos = WEP_GETPOS(*(const WordEntryPos *) a);
|
|
int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
|
|
|
|
if (apos == bpos)
|
|
return 0;
|
|
return (apos > bpos) ? 1 : -1;
|
|
}
|
|
|
|
/*
|
|
* Removes duplicate pos entries. If there's two entries with same pos but
|
|
* different weight, the higher weight is retained, so we can't use
|
|
* qunique here.
|
|
*
|
|
* Returns new length.
|
|
*/
|
|
static int
|
|
uniquePos(WordEntryPos *a, int l)
|
|
{
|
|
WordEntryPos *ptr,
|
|
*res;
|
|
|
|
if (l <= 1)
|
|
return l;
|
|
|
|
qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos);
|
|
|
|
res = a;
|
|
ptr = a + 1;
|
|
while (ptr - a < l)
|
|
{
|
|
if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
|
|
{
|
|
res++;
|
|
*res = *ptr;
|
|
if (res - a >= MAXNUMPOS - 1 ||
|
|
WEP_GETPOS(*res) == MAXENTRYPOS - 1)
|
|
break;
|
|
}
|
|
else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
|
|
WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
|
|
ptr++;
|
|
}
|
|
|
|
return res + 1 - a;
|
|
}
|
|
|
|
/* Compare two WordEntryIN values for qsort */
|
|
static int
|
|
compareentry(const void *va, const void *vb, void *arg)
|
|
{
|
|
const WordEntryIN *a = (const WordEntryIN *) va;
|
|
const WordEntryIN *b = (const WordEntryIN *) vb;
|
|
char *BufferStr = (char *) arg;
|
|
|
|
return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
|
|
&BufferStr[b->entry.pos], b->entry.len,
|
|
false);
|
|
}
|
|
|
|
/*
|
|
* Sort an array of WordEntryIN, remove duplicates.
|
|
* *outbuflen receives the amount of space needed for strings and positions.
|
|
*/
|
|
static int
|
|
uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
|
|
{
|
|
int buflen;
|
|
WordEntryIN *ptr,
|
|
*res;
|
|
|
|
Assert(l >= 1);
|
|
|
|
if (l > 1)
|
|
qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
|
|
(void *) buf);
|
|
|
|
buflen = 0;
|
|
res = a;
|
|
ptr = a + 1;
|
|
while (ptr - a < l)
|
|
{
|
|
if (!(ptr->entry.len == res->entry.len &&
|
|
strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
|
|
res->entry.len) == 0))
|
|
{
|
|
/* done accumulating data into *res, count space needed */
|
|
buflen += res->entry.len;
|
|
if (res->entry.haspos)
|
|
{
|
|
res->poslen = uniquePos(res->pos, res->poslen);
|
|
buflen = SHORTALIGN(buflen);
|
|
buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
|
|
}
|
|
res++;
|
|
if (res != ptr)
|
|
memcpy(res, ptr, sizeof(WordEntryIN));
|
|
}
|
|
else if (ptr->entry.haspos)
|
|
{
|
|
if (res->entry.haspos)
|
|
{
|
|
/* append ptr's positions to res's positions */
|
|
int newlen = ptr->poslen + res->poslen;
|
|
|
|
res->pos = (WordEntryPos *)
|
|
repalloc(res->pos, newlen * sizeof(WordEntryPos));
|
|
memcpy(&res->pos[res->poslen], ptr->pos,
|
|
ptr->poslen * sizeof(WordEntryPos));
|
|
res->poslen = newlen;
|
|
pfree(ptr->pos);
|
|
}
|
|
else
|
|
{
|
|
/* just give ptr's positions to pos */
|
|
res->entry.haspos = 1;
|
|
res->pos = ptr->pos;
|
|
res->poslen = ptr->poslen;
|
|
}
|
|
}
|
|
ptr++;
|
|
}
|
|
|
|
/* count space needed for last item */
|
|
buflen += res->entry.len;
|
|
if (res->entry.haspos)
|
|
{
|
|
res->poslen = uniquePos(res->pos, res->poslen);
|
|
buflen = SHORTALIGN(buflen);
|
|
buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
|
|
}
|
|
|
|
*outbuflen = buflen;
|
|
return res + 1 - a;
|
|
}
|
|
|
|
static int
|
|
WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
|
|
{
|
|
return compareentry(a, b, buf);
|
|
}
|
|
|
|
|
|
Datum
|
|
tsvectorin(PG_FUNCTION_ARGS)
|
|
{
|
|
char *buf = PG_GETARG_CSTRING(0);
|
|
TSVectorParseState state;
|
|
WordEntryIN *arr;
|
|
int totallen;
|
|
int arrlen; /* allocated size of arr */
|
|
WordEntry *inarr;
|
|
int len = 0;
|
|
TSVector in;
|
|
int i;
|
|
char *token;
|
|
int toklen;
|
|
WordEntryPos *pos;
|
|
int poslen;
|
|
char *strbuf;
|
|
int stroff;
|
|
|
|
/*
|
|
* Tokens are appended to tmpbuf, cur is a pointer to the end of used
|
|
* space in tmpbuf.
|
|
*/
|
|
char *tmpbuf;
|
|
char *cur;
|
|
int buflen = 256; /* allocated size of tmpbuf */
|
|
|
|
state = init_tsvector_parser(buf, 0);
|
|
|
|
arrlen = 64;
|
|
arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
|
|
cur = tmpbuf = (char *) palloc(buflen);
|
|
|
|
while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
|
|
{
|
|
if (toklen >= MAXSTRLEN)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
errmsg("word is too long (%ld bytes, max %ld bytes)",
|
|
(long) toklen,
|
|
(long) (MAXSTRLEN - 1))));
|
|
|
|
if (cur - tmpbuf > MAXSTRPOS)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
|
|
(long) (cur - tmpbuf), (long) MAXSTRPOS)));
|
|
|
|
/*
|
|
* Enlarge buffers if needed
|
|
*/
|
|
if (len >= arrlen)
|
|
{
|
|
arrlen *= 2;
|
|
arr = (WordEntryIN *)
|
|
repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
|
|
}
|
|
while ((cur - tmpbuf) + toklen >= buflen)
|
|
{
|
|
int dist = cur - tmpbuf;
|
|
|
|
buflen *= 2;
|
|
tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
|
|
cur = tmpbuf + dist;
|
|
}
|
|
arr[len].entry.len = toklen;
|
|
arr[len].entry.pos = cur - tmpbuf;
|
|
memcpy((void *) cur, (void *) token, toklen);
|
|
cur += toklen;
|
|
|
|
if (poslen != 0)
|
|
{
|
|
arr[len].entry.haspos = 1;
|
|
arr[len].pos = pos;
|
|
arr[len].poslen = poslen;
|
|
}
|
|
else
|
|
{
|
|
arr[len].entry.haspos = 0;
|
|
arr[len].pos = NULL;
|
|
arr[len].poslen = 0;
|
|
}
|
|
len++;
|
|
}
|
|
|
|
close_tsvector_parser(state);
|
|
|
|
if (len > 0)
|
|
len = uniqueentry(arr, len, tmpbuf, &buflen);
|
|
else
|
|
buflen = 0;
|
|
|
|
if (buflen > MAXSTRPOS)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
|
|
|
|
totallen = CALCDATASIZE(len, buflen);
|
|
in = (TSVector) palloc0(totallen);
|
|
SET_VARSIZE(in, totallen);
|
|
in->size = len;
|
|
inarr = ARRPTR(in);
|
|
strbuf = STRPTR(in);
|
|
stroff = 0;
|
|
for (i = 0; i < len; i++)
|
|
{
|
|
memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
|
|
arr[i].entry.pos = stroff;
|
|
stroff += arr[i].entry.len;
|
|
if (arr[i].entry.haspos)
|
|
{
|
|
if (arr[i].poslen > 0xFFFF)
|
|
elog(ERROR, "positions array too long");
|
|
|
|
/* Copy number of positions */
|
|
stroff = SHORTALIGN(stroff);
|
|
*(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
|
|
stroff += sizeof(uint16);
|
|
|
|
/* Copy positions */
|
|
memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
|
|
stroff += arr[i].poslen * sizeof(WordEntryPos);
|
|
|
|
pfree(arr[i].pos);
|
|
}
|
|
inarr[i] = arr[i].entry;
|
|
}
|
|
|
|
Assert((strbuf + stroff - (char *) in) == totallen);
|
|
|
|
PG_RETURN_TSVECTOR(in);
|
|
}
|
|
|
|
Datum
|
|
tsvectorout(PG_FUNCTION_ARGS)
|
|
{
|
|
TSVector out = PG_GETARG_TSVECTOR(0);
|
|
char *outbuf;
|
|
int32 i,
|
|
lenbuf = 0,
|
|
pp;
|
|
WordEntry *ptr = ARRPTR(out);
|
|
char *curbegin,
|
|
*curin,
|
|
*curout;
|
|
|
|
lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
|
|
for (i = 0; i < out->size; i++)
|
|
{
|
|
lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
|
|
if (ptr[i].haspos)
|
|
lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
|
|
}
|
|
|
|
curout = outbuf = (char *) palloc(lenbuf);
|
|
for (i = 0; i < out->size; i++)
|
|
{
|
|
curbegin = curin = STRPTR(out) + ptr->pos;
|
|
if (i != 0)
|
|
*curout++ = ' ';
|
|
*curout++ = '\'';
|
|
while (curin - curbegin < ptr->len)
|
|
{
|
|
int len = pg_mblen(curin);
|
|
|
|
if (t_iseq(curin, '\''))
|
|
*curout++ = '\'';
|
|
else if (t_iseq(curin, '\\'))
|
|
*curout++ = '\\';
|
|
|
|
while (len--)
|
|
*curout++ = *curin++;
|
|
}
|
|
|
|
*curout++ = '\'';
|
|
if ((pp = POSDATALEN(out, ptr)) != 0)
|
|
{
|
|
WordEntryPos *wptr;
|
|
|
|
*curout++ = ':';
|
|
wptr = POSDATAPTR(out, ptr);
|
|
while (pp)
|
|
{
|
|
curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
|
|
switch (WEP_GETWEIGHT(*wptr))
|
|
{
|
|
case 3:
|
|
*curout++ = 'A';
|
|
break;
|
|
case 2:
|
|
*curout++ = 'B';
|
|
break;
|
|
case 1:
|
|
*curout++ = 'C';
|
|
break;
|
|
case 0:
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if (pp > 1)
|
|
*curout++ = ',';
|
|
pp--;
|
|
wptr++;
|
|
}
|
|
}
|
|
ptr++;
|
|
}
|
|
|
|
*curout = '\0';
|
|
PG_FREE_IF_COPY(out, 0);
|
|
PG_RETURN_CSTRING(outbuf);
|
|
}
|
|
|
|
/*
|
|
* Binary Input / Output functions. The binary format is as follows:
|
|
*
|
|
* uint32 number of lexemes
|
|
*
|
|
* for each lexeme:
|
|
* lexeme text in client encoding, null-terminated
|
|
* uint16 number of positions
|
|
* for each position:
|
|
* uint16 WordEntryPos
|
|
*/
|
|
|
|
Datum
|
|
tsvectorsend(PG_FUNCTION_ARGS)
|
|
{
|
|
TSVector vec = PG_GETARG_TSVECTOR(0);
|
|
StringInfoData buf;
|
|
int i,
|
|
j;
|
|
WordEntry *weptr = ARRPTR(vec);
|
|
|
|
pq_begintypsend(&buf);
|
|
|
|
pq_sendint32(&buf, vec->size);
|
|
for (i = 0; i < vec->size; i++)
|
|
{
|
|
uint16 npos;
|
|
|
|
/*
|
|
* the strings in the TSVector array are not null-terminated, so we
|
|
* have to send the null-terminator separately
|
|
*/
|
|
pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
|
|
pq_sendbyte(&buf, '\0');
|
|
|
|
npos = POSDATALEN(vec, weptr);
|
|
pq_sendint16(&buf, npos);
|
|
|
|
if (npos > 0)
|
|
{
|
|
WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
|
|
|
|
for (j = 0; j < npos; j++)
|
|
pq_sendint16(&buf, wepptr[j]);
|
|
}
|
|
weptr++;
|
|
}
|
|
|
|
PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
|
|
}
|
|
|
|
Datum
|
|
tsvectorrecv(PG_FUNCTION_ARGS)
|
|
{
|
|
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
|
|
TSVector vec;
|
|
int i;
|
|
int32 nentries;
|
|
int datalen; /* number of bytes used in the variable size
|
|
* area after fixed size TSVector header and
|
|
* WordEntries */
|
|
Size hdrlen;
|
|
Size len; /* allocated size of vec */
|
|
bool needSort = false;
|
|
|
|
nentries = pq_getmsgint(buf, sizeof(int32));
|
|
if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
|
|
elog(ERROR, "invalid size of tsvector");
|
|
|
|
hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
|
|
|
|
len = hdrlen * 2; /* times two to make room for lexemes */
|
|
vec = (TSVector) palloc0(len);
|
|
vec->size = nentries;
|
|
|
|
datalen = 0;
|
|
for (i = 0; i < nentries; i++)
|
|
{
|
|
const char *lexeme;
|
|
uint16 npos;
|
|
size_t lex_len;
|
|
|
|
lexeme = pq_getmsgstring(buf);
|
|
npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
|
|
|
|
/* sanity checks */
|
|
|
|
lex_len = strlen(lexeme);
|
|
if (lex_len > MAXSTRLEN)
|
|
elog(ERROR, "invalid tsvector: lexeme too long");
|
|
|
|
if (datalen > MAXSTRPOS)
|
|
elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
|
|
|
|
if (npos > MAXNUMPOS)
|
|
elog(ERROR, "unexpected number of tsvector positions");
|
|
|
|
/*
|
|
* Looks valid. Fill the WordEntry struct, and copy lexeme.
|
|
*
|
|
* But make sure the buffer is large enough first.
|
|
*/
|
|
while (hdrlen + SHORTALIGN(datalen + lex_len) +
|
|
sizeof(uint16) + npos * sizeof(WordEntryPos) >= len)
|
|
{
|
|
len *= 2;
|
|
vec = (TSVector) repalloc(vec, len);
|
|
}
|
|
|
|
vec->entries[i].haspos = (npos > 0) ? 1 : 0;
|
|
vec->entries[i].len = lex_len;
|
|
vec->entries[i].pos = datalen;
|
|
|
|
memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
|
|
|
|
datalen += lex_len;
|
|
|
|
if (i > 0 && WordEntryCMP(&vec->entries[i],
|
|
&vec->entries[i - 1],
|
|
STRPTR(vec)) <= 0)
|
|
needSort = true;
|
|
|
|
/* Receive positions */
|
|
if (npos > 0)
|
|
{
|
|
uint16 j;
|
|
WordEntryPos *wepptr;
|
|
|
|
/*
|
|
* Pad to 2-byte alignment if necessary. Though we used palloc0
|
|
* for the initial allocation, subsequent repalloc'd memory areas
|
|
* are not initialized to zero.
|
|
*/
|
|
if (datalen != SHORTALIGN(datalen))
|
|
{
|
|
*(STRPTR(vec) + datalen) = '\0';
|
|
datalen = SHORTALIGN(datalen);
|
|
}
|
|
|
|
memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
|
|
|
|
wepptr = POSDATAPTR(vec, &vec->entries[i]);
|
|
for (j = 0; j < npos; j++)
|
|
{
|
|
wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
|
|
if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
|
|
elog(ERROR, "position information is misordered");
|
|
}
|
|
|
|
datalen += sizeof(uint16) + npos * sizeof(WordEntryPos);
|
|
}
|
|
}
|
|
|
|
SET_VARSIZE(vec, hdrlen + datalen);
|
|
|
|
if (needSort)
|
|
qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
|
|
compareentry, (void *) STRPTR(vec));
|
|
|
|
PG_RETURN_TSVECTOR(vec);
|
|
}
|