1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-19 13:42:17 +03:00
Files
postgres/src/backend/utils/adt/tsvector.c
Tom Lane a7187c3723 Remove unnecessary type violation in tsvectorrecv().
compareentry() is declared to work on WordEntryIN structs, but
tsvectorrecv() is using it in two places to work on WordEntry
structs.  This is almost okay, since WordEntry is the first
field of WordEntryIN.  But on machines with 8-byte pointers,
WordEntryIN will have a larger alignment spec than WordEntry,
and it's at least theoretically possible that the compiler
could generate code that depends on the larger alignment.

Given the lack of field reports, this may be just a hypothetical bug
that upsets nothing except sanitizer tools.  Or it may be real on
certain hardware but nobody's tried to use tsvectorrecv() on such
hardware.  In any case we should fix it, and the fix is trivial:
just change compareentry() so that it works on WordEntry without any
mention of WordEntryIN.  We can also get rid of the quite-useless
intermediate function WordEntryCMP.

Bug: #18875
Reported-by: Alexander Lakhin <exclusion@gmail.com>
Author: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/18875-07a29c49c825a608@postgresql.org
Backpatch-through: 13
2025-04-02 16:17:43 -04:00

555 lines
12 KiB
C

/*-------------------------------------------------------------------------
*
* tsvector.c
* I/O functions for tsvector
*
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/utils/adt/tsvector.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "common/int.h"
#include "libpq/pqformat.h"
#include "nodes/miscnodes.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
#include "utils/fmgrprotos.h"
#include "utils/memutils.h"
#include "varatt.h"
typedef struct
{
WordEntry entry; /* must be first, see compareentry */
WordEntryPos *pos;
int poslen; /* number of elements in pos */
} WordEntryIN;
/* Compare two WordEntryPos values for qsort */
int
compareWordEntryPos(const void *a, const void *b)
{
int apos = WEP_GETPOS(*(const WordEntryPos *) a);
int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
return pg_cmp_s32(apos, bpos);
}
/*
* Removes duplicate pos entries. If there's two entries with same pos but
* different weight, the higher weight is retained, so we can't use
* qunique here.
*
* Returns new length.
*/
static int
uniquePos(WordEntryPos *a, int l)
{
WordEntryPos *ptr,
*res;
if (l <= 1)
return l;
qsort(a, l, sizeof(WordEntryPos), compareWordEntryPos);
res = a;
ptr = a + 1;
while (ptr - a < l)
{
if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
{
res++;
*res = *ptr;
if (res - a >= MAXNUMPOS - 1 ||
WEP_GETPOS(*res) == MAXENTRYPOS - 1)
break;
}
else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
ptr++;
}
return res + 1 - a;
}
/*
* Compare two WordEntry structs for qsort_arg. This can also be used on
* WordEntryIN structs, since those have WordEntry as their first field.
*/
static int
compareentry(const void *va, const void *vb, void *arg)
{
const WordEntry *a = (const WordEntry *) va;
const WordEntry *b = (const WordEntry *) vb;
char *BufferStr = (char *) arg;
return tsCompareString(&BufferStr[a->pos], a->len,
&BufferStr[b->pos], b->len,
false);
}
/*
* Sort an array of WordEntryIN, remove duplicates.
* *outbuflen receives the amount of space needed for strings and positions.
*/
static int
uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
{
int buflen;
WordEntryIN *ptr,
*res;
Assert(l >= 1);
if (l > 1)
qsort_arg(a, l, sizeof(WordEntryIN), compareentry, buf);
buflen = 0;
res = a;
ptr = a + 1;
while (ptr - a < l)
{
if (!(ptr->entry.len == res->entry.len &&
strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
res->entry.len) == 0))
{
/* done accumulating data into *res, count space needed */
buflen += res->entry.len;
if (res->entry.haspos)
{
res->poslen = uniquePos(res->pos, res->poslen);
buflen = SHORTALIGN(buflen);
buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
}
res++;
if (res != ptr)
memcpy(res, ptr, sizeof(WordEntryIN));
}
else if (ptr->entry.haspos)
{
if (res->entry.haspos)
{
/* append ptr's positions to res's positions */
int newlen = ptr->poslen + res->poslen;
res->pos = (WordEntryPos *)
repalloc(res->pos, newlen * sizeof(WordEntryPos));
memcpy(&res->pos[res->poslen], ptr->pos,
ptr->poslen * sizeof(WordEntryPos));
res->poslen = newlen;
pfree(ptr->pos);
}
else
{
/* just give ptr's positions to pos */
res->entry.haspos = 1;
res->pos = ptr->pos;
res->poslen = ptr->poslen;
}
}
ptr++;
}
/* count space needed for last item */
buflen += res->entry.len;
if (res->entry.haspos)
{
res->poslen = uniquePos(res->pos, res->poslen);
buflen = SHORTALIGN(buflen);
buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
}
*outbuflen = buflen;
return res + 1 - a;
}
Datum
tsvectorin(PG_FUNCTION_ARGS)
{
char *buf = PG_GETARG_CSTRING(0);
Node *escontext = fcinfo->context;
TSVectorParseState state;
WordEntryIN *arr;
int totallen;
int arrlen; /* allocated size of arr */
WordEntry *inarr;
int len = 0;
TSVector in;
int i;
char *token;
int toklen;
WordEntryPos *pos;
int poslen;
char *strbuf;
int stroff;
/*
* Tokens are appended to tmpbuf, cur is a pointer to the end of used
* space in tmpbuf.
*/
char *tmpbuf;
char *cur;
int buflen = 256; /* allocated size of tmpbuf */
state = init_tsvector_parser(buf, 0, escontext);
arrlen = 64;
arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
cur = tmpbuf = (char *) palloc(buflen);
while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
{
if (toklen >= MAXSTRLEN)
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("word is too long (%ld bytes, max %ld bytes)",
(long) toklen,
(long) (MAXSTRLEN - 1))));
if (cur - tmpbuf > MAXSTRPOS)
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
(long) (cur - tmpbuf), (long) MAXSTRPOS)));
/*
* Enlarge buffers if needed
*/
if (len >= arrlen)
{
arrlen *= 2;
arr = (WordEntryIN *)
repalloc(arr, sizeof(WordEntryIN) * arrlen);
}
while ((cur - tmpbuf) + toklen >= buflen)
{
int dist = cur - tmpbuf;
buflen *= 2;
tmpbuf = (char *) repalloc(tmpbuf, buflen);
cur = tmpbuf + dist;
}
arr[len].entry.len = toklen;
arr[len].entry.pos = cur - tmpbuf;
memcpy(cur, token, toklen);
cur += toklen;
if (poslen != 0)
{
arr[len].entry.haspos = 1;
arr[len].pos = pos;
arr[len].poslen = poslen;
}
else
{
arr[len].entry.haspos = 0;
arr[len].pos = NULL;
arr[len].poslen = 0;
}
len++;
}
close_tsvector_parser(state);
/* Did gettoken_tsvector fail? */
if (SOFT_ERROR_OCCURRED(escontext))
PG_RETURN_NULL();
if (len > 0)
len = uniqueentry(arr, len, tmpbuf, &buflen);
else
buflen = 0;
if (buflen > MAXSTRPOS)
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
totallen = CALCDATASIZE(len, buflen);
in = (TSVector) palloc0(totallen);
SET_VARSIZE(in, totallen);
in->size = len;
inarr = ARRPTR(in);
strbuf = STRPTR(in);
stroff = 0;
for (i = 0; i < len; i++)
{
memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
arr[i].entry.pos = stroff;
stroff += arr[i].entry.len;
if (arr[i].entry.haspos)
{
/* This should be unreachable because of MAXNUMPOS restrictions */
if (arr[i].poslen > 0xFFFF)
elog(ERROR, "positions array too long");
/* Copy number of positions */
stroff = SHORTALIGN(stroff);
*(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
stroff += sizeof(uint16);
/* Copy positions */
memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
stroff += arr[i].poslen * sizeof(WordEntryPos);
pfree(arr[i].pos);
}
inarr[i] = arr[i].entry;
}
Assert((strbuf + stroff - (char *) in) == totallen);
PG_RETURN_TSVECTOR(in);
}
Datum
tsvectorout(PG_FUNCTION_ARGS)
{
TSVector out = PG_GETARG_TSVECTOR(0);
char *outbuf;
int32 i,
lenbuf = 0,
pp;
WordEntry *ptr = ARRPTR(out);
char *curbegin,
*curin,
*curout;
lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
for (i = 0; i < out->size; i++)
{
lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
if (ptr[i].haspos)
lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
}
curout = outbuf = (char *) palloc(lenbuf);
for (i = 0; i < out->size; i++)
{
curbegin = curin = STRPTR(out) + ptr->pos;
if (i != 0)
*curout++ = ' ';
*curout++ = '\'';
while (curin - curbegin < ptr->len)
{
int len = pg_mblen(curin);
if (t_iseq(curin, '\''))
*curout++ = '\'';
else if (t_iseq(curin, '\\'))
*curout++ = '\\';
while (len--)
*curout++ = *curin++;
}
*curout++ = '\'';
if ((pp = POSDATALEN(out, ptr)) != 0)
{
WordEntryPos *wptr;
*curout++ = ':';
wptr = POSDATAPTR(out, ptr);
while (pp)
{
curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
switch (WEP_GETWEIGHT(*wptr))
{
case 3:
*curout++ = 'A';
break;
case 2:
*curout++ = 'B';
break;
case 1:
*curout++ = 'C';
break;
case 0:
default:
break;
}
if (pp > 1)
*curout++ = ',';
pp--;
wptr++;
}
}
ptr++;
}
*curout = '\0';
PG_FREE_IF_COPY(out, 0);
PG_RETURN_CSTRING(outbuf);
}
/*
* Binary Input / Output functions. The binary format is as follows:
*
* uint32 number of lexemes
*
* for each lexeme:
* lexeme text in client encoding, null-terminated
* uint16 number of positions
* for each position:
* uint16 WordEntryPos
*/
Datum
tsvectorsend(PG_FUNCTION_ARGS)
{
TSVector vec = PG_GETARG_TSVECTOR(0);
StringInfoData buf;
int i,
j;
WordEntry *weptr = ARRPTR(vec);
pq_begintypsend(&buf);
pq_sendint32(&buf, vec->size);
for (i = 0; i < vec->size; i++)
{
uint16 npos;
/*
* the strings in the TSVector array are not null-terminated, so we
* have to send the null-terminator separately
*/
pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
pq_sendbyte(&buf, '\0');
npos = POSDATALEN(vec, weptr);
pq_sendint16(&buf, npos);
if (npos > 0)
{
WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
for (j = 0; j < npos; j++)
pq_sendint16(&buf, wepptr[j]);
}
weptr++;
}
PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}
Datum
tsvectorrecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
TSVector vec;
int i;
int32 nentries;
int datalen; /* number of bytes used in the variable size
* area after fixed size TSVector header and
* WordEntries */
Size hdrlen;
Size len; /* allocated size of vec */
bool needSort = false;
nentries = pq_getmsgint(buf, sizeof(int32));
if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
elog(ERROR, "invalid size of tsvector");
hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
len = hdrlen * 2; /* times two to make room for lexemes */
vec = (TSVector) palloc0(len);
vec->size = nentries;
datalen = 0;
for (i = 0; i < nentries; i++)
{
const char *lexeme;
uint16 npos;
size_t lex_len;
lexeme = pq_getmsgstring(buf);
npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
/* sanity checks */
lex_len = strlen(lexeme);
if (lex_len > MAXSTRLEN)
elog(ERROR, "invalid tsvector: lexeme too long");
if (datalen > MAXSTRPOS)
elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
if (npos > MAXNUMPOS)
elog(ERROR, "unexpected number of tsvector positions");
/*
* Looks valid. Fill the WordEntry struct, and copy lexeme.
*
* But make sure the buffer is large enough first.
*/
while (hdrlen + SHORTALIGN(datalen + lex_len) +
sizeof(uint16) + npos * sizeof(WordEntryPos) >= len)
{
len *= 2;
vec = (TSVector) repalloc(vec, len);
}
vec->entries[i].haspos = (npos > 0) ? 1 : 0;
vec->entries[i].len = lex_len;
vec->entries[i].pos = datalen;
memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
datalen += lex_len;
if (i > 0 && compareentry(&vec->entries[i],
&vec->entries[i - 1],
STRPTR(vec)) <= 0)
needSort = true;
/* Receive positions */
if (npos > 0)
{
uint16 j;
WordEntryPos *wepptr;
/*
* Pad to 2-byte alignment if necessary. Though we used palloc0
* for the initial allocation, subsequent repalloc'd memory areas
* are not initialized to zero.
*/
if (datalen != SHORTALIGN(datalen))
{
*(STRPTR(vec) + datalen) = '\0';
datalen = SHORTALIGN(datalen);
}
memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
wepptr = POSDATAPTR(vec, &vec->entries[i]);
for (j = 0; j < npos; j++)
{
wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
elog(ERROR, "position information is misordered");
}
datalen += sizeof(uint16) + npos * sizeof(WordEntryPos);
}
}
SET_VARSIZE(vec, hdrlen + datalen);
if (needSort)
qsort_arg(ARRPTR(vec), vec->size, sizeof(WordEntry),
compareentry, STRPTR(vec));
PG_RETURN_TSVECTOR(vec);
}