Avoid use of sscanf() to parse ispell dictionary files.

It turns out that on FreeBSD-derived platforms (including OS X), the *scanf() family of functions is pretty much brain-dead about multibyte characters. In particular it will apply isspace() to individual bytes of input even when those bytes are part of a multibyte character, thus allowing false recognition of a field-terminating space. We appear to have little alternative other than instituting a coding rule that *scanf() is not to be used if the input string might contain multibyte characters. (There was some discussion of relying on "%ls", but that probably just moves the portability problem somewhere else, and besides it doesn't fully prevent BSD *scanf() from using isspace().) This patch is a down payment on that: it gets rid of use of sscanf() to parse ispell dictionary files, which are certainly at great risk of having a problem. The code is cleaner this way anyway, though a bit longer. In passing, improve a few comments. Report and patch by Artur Zakirov, reviewed and somewhat tweaked by me. Back-patch to all supported branches.
2025-11-15 03:41:20 +03:00 · 2016-02-10 19:30:11 -05:00
parent 616eaa396a
commit b10635bb5b
1 changed files with 156 additions and 16 deletions
--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
@@ -457,13 +457,149 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
 	Conf->naffixes++;
 }
 #define PAE_WAIT_MASK	0
 #define PAE_INMASK	1
 #define PAE_WAIT_FIND	2
 #define PAE_INFIND	3
 #define PAE_WAIT_REPL	4
 #define PAE_INREPL	5
 /* Parsing states for parse_affentry() and friends */
 #define PAE_WAIT_MASK	0
 #define PAE_INMASK		1
 #define PAE_WAIT_FIND	2
 #define PAE_INFIND		3
 #define PAE_WAIT_REPL	4
 #define PAE_INREPL		5
 #define PAE_WAIT_TYPE	6
 #define PAE_WAIT_FLAG	7
 /*
 * Parse next space-separated field of an .affix file line.
 *
 * *str is the input pointer (will be advanced past field)
 * next is where to copy the field value to, with null termination
 *
 * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
 *
 * Returns TRUE if we found a field, FALSE if not.
 */
 static bool
 get_nextfield(char **str, char *next)
 {
 	int			state = PAE_WAIT_MASK;
 	int			avail = BUFSIZ;
 	while (**str)
 	{
 		if (state == PAE_WAIT_MASK)
 		{
 			if (t_iseq(*str, '#'))
 				return false;
 			else if (!t_isspace(*str))
 			{
 				int			clen = pg_mblen(*str);
 				if (clen < avail)
 				{
 					COPYCHAR(next, *str);
 					next += clen;
 					avail -= clen;
 				}
 				state = PAE_INMASK;
 			}
 		}
 		else	/* state == PAE_INMASK */
 		{
 			if (t_isspace(*str))
 			{
 				*next = '\0';
 				return true;
 			}
 			else
 			{
 				int			clen = pg_mblen(*str);
 				if (clen < avail)
 				{
 					COPYCHAR(next, *str);
 					next += clen;
 					avail -= clen;
 				}
 			}
 		}
 		*str += pg_mblen(*str);
 	}
 	*next = '\0';
 	return (state == PAE_INMASK);		/* OK if we got a nonempty field */
 }
 /*
 * Parses entry of an .affix file of MySpell or Hunspell format.
 *
 * An .affix file entry has the following format:
 * - header
 *	 <type>  <flag>  <cross_flag>  <flag_count>
 * - fields after header:
 *	 <type>  <flag>  <find>  <replace>	<mask>
 *
 * str is the input line
 * field values are returned to type etc, which must be buffers of size BUFSIZ.
 *
 * Returns number of fields found; any omitted fields are set to empty strings.
 */
 static int
 parse_ooaffentry(char *str, char *type, char *flag, char *find,
 				 char *repl, char *mask)
 {
 	int			state = PAE_WAIT_TYPE;
 	int			fields_read = 0;
 	bool		valid = false;
 	*type = *flag = *find = *repl = *mask = '\0';
 	while (*str)
 	{
 		switch (state)
 		{
 			case PAE_WAIT_TYPE:
 				valid = get_nextfield(&str, type);
 				state = PAE_WAIT_FLAG;
 				break;
 			case PAE_WAIT_FLAG:
 				valid = get_nextfield(&str, flag);
 				state = PAE_WAIT_FIND;
 				break;
 			case PAE_WAIT_FIND:
 				valid = get_nextfield(&str, find);
 				state = PAE_WAIT_REPL;
 				break;
 			case PAE_WAIT_REPL:
 				valid = get_nextfield(&str, repl);
 				state = PAE_WAIT_MASK;
 				break;
 			case PAE_WAIT_MASK:
 				valid = get_nextfield(&str, mask);
 				state = -1;		/* force loop exit */
 				break;
 			default:
 				elog(ERROR, "unrecognized state in parse_ooaffentry: %d",
 					 state);
 				break;
 		}
 		if (valid)
 			fields_read++;
 		else
 			break;				/* early EOL */
 		if (state < 0)
 			break;				/* got all fields */
 	}
 	return fields_read;
 }
 /*
 * Parses entry of an .affix file of Ispell format
 *
 * An .affix file entry has the following format:
 * <mask>  >  [-<find>,]<replace>
 */
 static bool
 parse_affentry(char *str, char *mask, char *find, char *repl)
 {
@@ -618,8 +754,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 	int			flag = 0;
 	char		flagflags = 0;
 	tsearch_readline_state trst;
 	int			scanread = 0;
 	char		scanbuf[BUFSIZ];
 	char	   *recoded;
 	/* read file to find any flag */
@@ -682,8 +816,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 	}
 	tsearch_readline_end(&trst);
 	sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
 	if (!tsearch_readline_begin(&trst, filename))
 		ereport(ERROR,
 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
@@ -692,18 +824,21 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 	while ((recoded = tsearch_readline(&trst)) != NULL)
 	{
 		int			fields_read;
 		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
 			goto nextline;
-		scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
+		fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
 		if (ptype)
 			pfree(ptype);
 		ptype = lowerstr_ctx(Conf, type);
-		if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
+		if (fields_read < 4 ||
 			(STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
 			goto nextline;
-		if (scanread == 4)
+		if (fields_read == 4)
 		{
 			if (strlen(sflag) != 1)
 				goto nextline;
@@ -722,9 +857,13 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 			if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
 				goto nextline;
 			prepl = lowerstr_ctx(Conf, repl);
-			/* affix flag */
+			/* Find position of '/' in lowercased string "prepl" */
 			if ((ptr = strchr(prepl, '/')) != NULL)
 			{
 				/*
 				 * Here we use non-lowercased string "repl". We need position
 				 * of '/' in "repl".
 				 */
 				*ptr = '\0';
 				ptr = repl + (ptr - prepl) + 1;
 				while (*ptr)
@@ -800,11 +939,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
 		if (STRNCMP(pstr, "compoundwords") == 0)
 		{
 			/* Find position in lowercased string "pstr" */
 			s = findchar(pstr, 'l');
 			if (s)
 			{
-				s = recoded + (s - pstr);		/* we need non-lowercased
+				/* Here we use non-lowercased string "recoded" */
-												 * string */
+				s = recoded + (s - pstr);
 				while (*s && !t_isspace(s))
 					s += pg_mblen(s);
 				while (*s && t_isspace(s))