mirror of
https://github.com/postgres/postgres.git
synced 2025-05-06 19:59:18 +03:00
Avoid use of sscanf() to parse ispell dictionary files.
It turns out that on FreeBSD-derived platforms (including OS X), the *scanf() family of functions is pretty much brain-dead about multibyte characters. In particular it will apply isspace() to individual bytes of input even when those bytes are part of a multibyte character, thus allowing false recognition of a field-terminating space. We appear to have little alternative other than instituting a coding rule that *scanf() is not to be used if the input string might contain multibyte characters. (There was some discussion of relying on "%ls", but that probably just moves the portability problem somewhere else, and besides it doesn't fully prevent BSD *scanf() from using isspace().) This patch is a down payment on that: it gets rid of use of sscanf() to parse ispell dictionary files, which are certainly at great risk of having a problem. The code is cleaner this way anyway, though a bit longer. In passing, improve a few comments. Report and patch by Artur Zakirov, reviewed and somewhat tweaked by me. Back-patch to all supported branches.
This commit is contained in:
parent
616eaa396a
commit
b10635bb5b
@ -457,13 +457,149 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
|
|||||||
Conf->naffixes++;
|
Conf->naffixes++;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define PAE_WAIT_MASK 0
|
|
||||||
#define PAE_INMASK 1
|
|
||||||
#define PAE_WAIT_FIND 2
|
|
||||||
#define PAE_INFIND 3
|
|
||||||
#define PAE_WAIT_REPL 4
|
|
||||||
#define PAE_INREPL 5
|
|
||||||
|
|
||||||
|
/* Parsing states for parse_affentry() and friends */
|
||||||
|
#define PAE_WAIT_MASK 0
|
||||||
|
#define PAE_INMASK 1
|
||||||
|
#define PAE_WAIT_FIND 2
|
||||||
|
#define PAE_INFIND 3
|
||||||
|
#define PAE_WAIT_REPL 4
|
||||||
|
#define PAE_INREPL 5
|
||||||
|
#define PAE_WAIT_TYPE 6
|
||||||
|
#define PAE_WAIT_FLAG 7
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Parse next space-separated field of an .affix file line.
|
||||||
|
*
|
||||||
|
* *str is the input pointer (will be advanced past field)
|
||||||
|
* next is where to copy the field value to, with null termination
|
||||||
|
*
|
||||||
|
* The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
|
||||||
|
*
|
||||||
|
* Returns TRUE if we found a field, FALSE if not.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
get_nextfield(char **str, char *next)
|
||||||
|
{
|
||||||
|
int state = PAE_WAIT_MASK;
|
||||||
|
int avail = BUFSIZ;
|
||||||
|
|
||||||
|
while (**str)
|
||||||
|
{
|
||||||
|
if (state == PAE_WAIT_MASK)
|
||||||
|
{
|
||||||
|
if (t_iseq(*str, '#'))
|
||||||
|
return false;
|
||||||
|
else if (!t_isspace(*str))
|
||||||
|
{
|
||||||
|
int clen = pg_mblen(*str);
|
||||||
|
|
||||||
|
if (clen < avail)
|
||||||
|
{
|
||||||
|
COPYCHAR(next, *str);
|
||||||
|
next += clen;
|
||||||
|
avail -= clen;
|
||||||
|
}
|
||||||
|
state = PAE_INMASK;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else /* state == PAE_INMASK */
|
||||||
|
{
|
||||||
|
if (t_isspace(*str))
|
||||||
|
{
|
||||||
|
*next = '\0';
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int clen = pg_mblen(*str);
|
||||||
|
|
||||||
|
if (clen < avail)
|
||||||
|
{
|
||||||
|
COPYCHAR(next, *str);
|
||||||
|
next += clen;
|
||||||
|
avail -= clen;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*str += pg_mblen(*str);
|
||||||
|
}
|
||||||
|
|
||||||
|
*next = '\0';
|
||||||
|
|
||||||
|
return (state == PAE_INMASK); /* OK if we got a nonempty field */
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Parses entry of an .affix file of MySpell or Hunspell format.
|
||||||
|
*
|
||||||
|
* An .affix file entry has the following format:
|
||||||
|
* - header
|
||||||
|
* <type> <flag> <cross_flag> <flag_count>
|
||||||
|
* - fields after header:
|
||||||
|
* <type> <flag> <find> <replace> <mask>
|
||||||
|
*
|
||||||
|
* str is the input line
|
||||||
|
* field values are returned to type etc, which must be buffers of size BUFSIZ.
|
||||||
|
*
|
||||||
|
* Returns number of fields found; any omitted fields are set to empty strings.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
parse_ooaffentry(char *str, char *type, char *flag, char *find,
|
||||||
|
char *repl, char *mask)
|
||||||
|
{
|
||||||
|
int state = PAE_WAIT_TYPE;
|
||||||
|
int fields_read = 0;
|
||||||
|
bool valid = false;
|
||||||
|
|
||||||
|
*type = *flag = *find = *repl = *mask = '\0';
|
||||||
|
|
||||||
|
while (*str)
|
||||||
|
{
|
||||||
|
switch (state)
|
||||||
|
{
|
||||||
|
case PAE_WAIT_TYPE:
|
||||||
|
valid = get_nextfield(&str, type);
|
||||||
|
state = PAE_WAIT_FLAG;
|
||||||
|
break;
|
||||||
|
case PAE_WAIT_FLAG:
|
||||||
|
valid = get_nextfield(&str, flag);
|
||||||
|
state = PAE_WAIT_FIND;
|
||||||
|
break;
|
||||||
|
case PAE_WAIT_FIND:
|
||||||
|
valid = get_nextfield(&str, find);
|
||||||
|
state = PAE_WAIT_REPL;
|
||||||
|
break;
|
||||||
|
case PAE_WAIT_REPL:
|
||||||
|
valid = get_nextfield(&str, repl);
|
||||||
|
state = PAE_WAIT_MASK;
|
||||||
|
break;
|
||||||
|
case PAE_WAIT_MASK:
|
||||||
|
valid = get_nextfield(&str, mask);
|
||||||
|
state = -1; /* force loop exit */
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
elog(ERROR, "unrecognized state in parse_ooaffentry: %d",
|
||||||
|
state);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (valid)
|
||||||
|
fields_read++;
|
||||||
|
else
|
||||||
|
break; /* early EOL */
|
||||||
|
if (state < 0)
|
||||||
|
break; /* got all fields */
|
||||||
|
}
|
||||||
|
|
||||||
|
return fields_read;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Parses entry of an .affix file of Ispell format
|
||||||
|
*
|
||||||
|
* An .affix file entry has the following format:
|
||||||
|
* <mask> > [-<find>,]<replace>
|
||||||
|
*/
|
||||||
static bool
|
static bool
|
||||||
parse_affentry(char *str, char *mask, char *find, char *repl)
|
parse_affentry(char *str, char *mask, char *find, char *repl)
|
||||||
{
|
{
|
||||||
@ -618,8 +754,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
|
|||||||
int flag = 0;
|
int flag = 0;
|
||||||
char flagflags = 0;
|
char flagflags = 0;
|
||||||
tsearch_readline_state trst;
|
tsearch_readline_state trst;
|
||||||
int scanread = 0;
|
|
||||||
char scanbuf[BUFSIZ];
|
|
||||||
char *recoded;
|
char *recoded;
|
||||||
|
|
||||||
/* read file to find any flag */
|
/* read file to find any flag */
|
||||||
@ -682,8 +816,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
|
|||||||
}
|
}
|
||||||
tsearch_readline_end(&trst);
|
tsearch_readline_end(&trst);
|
||||||
|
|
||||||
sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
|
|
||||||
|
|
||||||
if (!tsearch_readline_begin(&trst, filename))
|
if (!tsearch_readline_begin(&trst, filename))
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||||
@ -692,18 +824,21 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
|
|||||||
|
|
||||||
while ((recoded = tsearch_readline(&trst)) != NULL)
|
while ((recoded = tsearch_readline(&trst)) != NULL)
|
||||||
{
|
{
|
||||||
|
int fields_read;
|
||||||
|
|
||||||
if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
|
if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
|
||||||
goto nextline;
|
goto nextline;
|
||||||
|
|
||||||
scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
|
fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
|
||||||
|
|
||||||
if (ptype)
|
if (ptype)
|
||||||
pfree(ptype);
|
pfree(ptype);
|
||||||
ptype = lowerstr_ctx(Conf, type);
|
ptype = lowerstr_ctx(Conf, type);
|
||||||
if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
|
if (fields_read < 4 ||
|
||||||
|
(STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
|
||||||
goto nextline;
|
goto nextline;
|
||||||
|
|
||||||
if (scanread == 4)
|
if (fields_read == 4)
|
||||||
{
|
{
|
||||||
if (strlen(sflag) != 1)
|
if (strlen(sflag) != 1)
|
||||||
goto nextline;
|
goto nextline;
|
||||||
@ -722,9 +857,13 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
|
|||||||
if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
|
if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
|
||||||
goto nextline;
|
goto nextline;
|
||||||
prepl = lowerstr_ctx(Conf, repl);
|
prepl = lowerstr_ctx(Conf, repl);
|
||||||
/* affix flag */
|
/* Find position of '/' in lowercased string "prepl" */
|
||||||
if ((ptr = strchr(prepl, '/')) != NULL)
|
if ((ptr = strchr(prepl, '/')) != NULL)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* Here we use non-lowercased string "repl". We need position
|
||||||
|
* of '/' in "repl".
|
||||||
|
*/
|
||||||
*ptr = '\0';
|
*ptr = '\0';
|
||||||
ptr = repl + (ptr - prepl) + 1;
|
ptr = repl + (ptr - prepl) + 1;
|
||||||
while (*ptr)
|
while (*ptr)
|
||||||
@ -800,11 +939,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
|
|||||||
|
|
||||||
if (STRNCMP(pstr, "compoundwords") == 0)
|
if (STRNCMP(pstr, "compoundwords") == 0)
|
||||||
{
|
{
|
||||||
|
/* Find position in lowercased string "pstr" */
|
||||||
s = findchar(pstr, 'l');
|
s = findchar(pstr, 'l');
|
||||||
if (s)
|
if (s)
|
||||||
{
|
{
|
||||||
s = recoded + (s - pstr); /* we need non-lowercased
|
/* Here we use non-lowercased string "recoded" */
|
||||||
* string */
|
s = recoded + (s - pstr);
|
||||||
while (*s && !t_isspace(s))
|
while (*s && !t_isspace(s))
|
||||||
s += pg_mblen(s);
|
s += pg_mblen(s);
|
||||||
while (*s && t_isspace(s))
|
while (*s && t_isspace(s))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user