1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-22 14:32:25 +03:00

Cleanup for some problems in tsearch patch:

- ispell initialization crashed on empty dictionary file
- ispell initialization crashed on affix file with prefixes but no suffixes
- stop words file was run through pg_verifymbstr, with database
  encoding, but it's supposed to be UTF-8; similar bug for synonym files
- bunch of comments added, typos fixed, and other cleanup

Introduced consistent encoding checking/conversion of data read from tsearch
configuration files, by doing this in a single t_readline() subroutine
(replacing direct usages of fgets).  Cleaned up API for readstopwords too.

Heikki Linnakangas
This commit is contained in:
Tom Lane
2007-08-25 00:03:59 +00:00
parent b918bf86c6
commit 7351b5fa17
14 changed files with 344 additions and 331 deletions

View File

@@ -7,7 +7,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.2 2007/08/22 01:39:44 tgl Exp $
* $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -63,21 +63,29 @@ get_tsearch_config_filename(const char *basename,
return result;
}
/* Size of the fixed line buffer used when reading a stopword file with fgets */
#define STOPBUFLEN 4096

/*
 * Comparator for arrays of C strings: 'a' and 'b' each point to a
 * char * element, and the strings they refer to are ordered by strcmp().
 */
static int
comparestr(const void *a, const void *b)
{
	const char *left = *(char **) a;
	const char *right = *(char **) b;

	return strcmp(left, right);
}
/*
* Reads a stopword file. Each word is run through 'wordop'
* function, if given. wordop may either modify the input in-place,
* or palloc a new version.
*
* fname: base name of the stopword file; it is resolved with
* get_tsearch_config_filename() using the "stop" extension. If it is
* NULL or empty, no file is read and the list is left empty
* (s->len = 0, s->stop = NULL).
* s: receives the array of words (s->stop) and its length (s->len).
* wordop: optional per-word callback; when it returns a pointer other
* than the one it was given, the original line is pfree'd.
*
* Reports an ERROR if the file cannot be opened.
*
* NOTE(review): this listing interleaves the pre-patch and post-patch
* lines of the function (both signatures appear below); the description
* above follows the post-patch lines.
*/
void
readstoplist(char *in, StopList * s)
readstoplist(const char *fname, StopList *s, char *(*wordop) (char *))
{
char **stop = NULL;
s->len = 0;
if (in && *in)
if (fname && *fname)
{
char *filename = get_tsearch_config_filename(in, "stop");
char *filename = get_tsearch_config_filename(fname, "stop");
FILE *hin;
char buf[STOPBUFLEN];
char *line;
/* number of pointer slots currently allocated in 'stop' */
int reallen = 0;
int line = 0;
if ((hin = AllocateFile(filename, "r")) == NULL)
ereport(ERROR,
@@ -85,65 +93,56 @@ readstoplist(char *in, StopList * s)
errmsg("could not open stopword file \"%s\": %m",
filename)));
while (fgets(buf, STOPBUFLEN, hin))
while ((line = t_readline(hin)) != NULL)
{
char *pbuf = buf;
char *pbuf = line;
line++;
while (*pbuf && !isspace(*pbuf))
/* Cut the line off at the first whitespace character */
while (*pbuf && !t_isspace(pbuf))
pbuf++;
*pbuf = '\0';
if (*buf == '\0')
continue;
if (!pg_verifymbstr(buf, strlen(buf), true))
/* Skip empty lines */
if (*line == '\0')
{
FreeFile(hin);
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("invalid multibyte encoding at line %d in file \"%s\"",
line, filename)));
pfree(line);
continue;
}
/* Enlarge the pointer array when it is full, doubling it each time */
if (s->len >= reallen)
{
if (reallen == 0)
{
reallen = 16;
reallen = 64;
stop = (char **) palloc(sizeof(char *) * reallen);
}
else
{
reallen *= 2;
stop = (char **) repalloc((void *) stop, sizeof(char *) * reallen);
stop = (char **) repalloc((void *) stop,
sizeof(char *) * reallen);
}
}
if (s->wordop)
stop[s->len] = s->wordop(buf);
if (wordop)
{
stop[s->len] = wordop(line);
/* wordop returned a different string: release the original line */
if (stop[s->len] != line)
pfree(line);
}
else
stop[s->len] = pstrdup(buf);
/* no wordop: the line itself becomes the list entry */
stop[s->len] = line;
(s->len)++;
}
FreeFile(hin);
pfree(filename);
}
/* stays NULL when no file name was given or no words were read */
s->stop = stop;
}
/*
 * qsort-style comparison callback: both arguments are addresses of
 * char * entries; the result is strcmp() applied to the two strings.
 */
static int
comparestr(const void *a, const void *b)
{
	const char *const *lhs = (const char *const *) a;
	const char *const *rhs = (const char *const *) b;

	return strcmp(*lhs, *rhs);
}
/*
 * Sort the word array of 's' with comparestr so that it can be binary
 * searched. Does nothing when there is no array or it holds no entries.
 */
void
sortstoplist(StopList * s)
{
	if (s->stop == NULL || s->len <= 0)
		return;

	qsort(s->stop, s->len, sizeof(char *), comparestr);
}