Improve make_greater_string() with encoding-specific incrementers.

This infrastructure doesn't in any way guarantee that the character we produce will sort after the one we incremented; but it does at least make it much more likely that we'll end up with something that is a valid character, which improves our chances. Kyotaro Horiguchi, with various adjustments by me.
2025-11-07 19:06:32 +03:00 · 2011-10-29 14:22:20 -04:00
parent 51eba98cf4
commit 78d523b633
3 changed files with 297 additions and 28 deletions
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -5665,6 +5665,19 @@ pattern_selectivity(Const *patt, Pattern_Type ptype)
 }
 /*
 * Character incrementer used for bytea values, where every byte stands on
 * its own and multibyte sequences never need to be considered.
 *
 * Adds one to the byte at ptr and returns true.  If that byte already holds
 * 255 it is left untouched and false is returned.  The len argument is
 * accepted only so the signature matches the other incrementers; it is not
 * examined.
 */
 static bool
 byte_increment(unsigned char *ptr, int len)
 {
 	bool		has_room = (*ptr < 255);
 	if (has_room)
 		++(*ptr);
 	return has_room;
 }
 /*
 * Try to generate a string greater than the given string or any
 * string it is a prefix of.  If successful, return a palloc'd string
@@ -5704,6 +5717,7 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
 	int			len;
 	Datum		cmpstr;
 	text	   *cmptxt = NULL;
 	mbcharacter_incrementer charinc;
 	/*
 	 * Get a modifiable copy of the prefix string in C-string format, and set
@@ -5765,29 +5779,33 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
 		}
 	}
 	if (datatype == BYTEAOID)
 		charinc = &byte_increment;
 	else
 		charinc = pg_database_encoding_character_incrementer();
 	while (len > 0)
 	{
-		unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
+		int		charlen;
-		unsigned char savelastchar = *lastchar;
+		unsigned char *lastchar;
 		/*
 		 * Try to generate a larger string by incrementing the last byte.
 		 */
 		while (*lastchar < (unsigned char) 255)
 		{
 		Const	   *workstr_const;
-			(*lastchar)++;
+		if (datatype == BYTEAOID)
-
+			charlen = 1;
 			if (datatype != BYTEAOID)
 			{
 				/* do not generate invalid encoding sequences */
 				if (!pg_verifymbstr(workstr, len, true))
 					continue;
 				workstr_const = string_to_const(workstr, datatype);
 			}
 		else
 			charlen = len - pg_mbcliplen(workstr, len, len - 1);
 		lastchar = (unsigned char *) (workstr + len - charlen);
 		/*
 		 * Try to generate a larger string by incrementing the last character
 		 * (for BYTEA, we treat each byte as a character).
 		 */
 		if (charinc(lastchar, charlen))
 		{
 			if (datatype == BYTEAOID)
 				workstr_const = string_to_bytea_const(workstr, len);
 			else
 				workstr_const = string_to_const(workstr, datatype);
 			if (DatumGetBool(FunctionCall2Coll(ltproc,
 											   collation,
@@ -5806,19 +5824,10 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
 			pfree(workstr_const);
 		}
 		/* restore last byte so we don't confuse pg_mbcliplen */
 		*lastchar = savelastchar;
 		/*
-		 * Truncate off the last character, which might be more than 1 byte,
+		 * Truncate off the last character or byte.
 		 * depending on the character encoding.
 		 */
-		if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
+		len -= charlen;
 			len = pg_mbcliplen(workstr, len, len - 1);
 		else
 			len -= 1;
 		if (datatype != BYTEAOID)
 		workstr[len] = '\0';
 	}
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -1334,6 +1334,244 @@ pg_utf8_islegal(const unsigned char *source, int length)
 	return true;
 }
 #ifndef FRONTEND
 /*
 * Fallback character incrementer for encodings that have no dedicated
 * routine.
 *
 * Nothing is assumed about the encoding.  Only the final byte of the
 * len-byte character is altered: it is stepped upward one value at a time
 * until pg_verifymbstr() accepts the sequence, at which point we return
 * true.  If the byte reaches 255 without an acceptable sequence turning up,
 * the original byte is put back and false is returned, so the caller's
 * string is unchanged on failure.
 */
 static bool
 pg_generic_charinc(unsigned char *charptr, int len)
 {
 	unsigned char *tail = charptr + len - 1;
 	const unsigned char original = *tail;
 	while (*tail != (unsigned char) 255)
 	{
 		++(*tail);
 		if (pg_verifymbstr((const char *) charptr, len, true))
 			return true;
 	}
 	/* ran out of candidate values: undo our scribbling */
 	*tail = original;
 	return false;
 }
 /*
 * UTF-8 character increment function.
 *
 * Only lengths 1 through 4 are handled; any other length fails immediately
 * without touching the input.
 *
 * Working from the last byte toward the first, each trailing byte that is
 * still below its ceiling is incremented and we are done.  A trailing byte
 * already at (or beyond) its ceiling is reset to 0x80 and the carry moves
 * one byte to the left.  The ceiling is 0xBF, except for the second byte,
 * where it is 0x9F after a lead byte of 0xED (which steps over the surrogate
 * range) and 0x8F after a lead byte of 0xF4.
 *
 * If the carry reaches the lead byte, that byte is incremented unless it is
 * 0x7F, 0xDF, 0xEF or 0xF4, the largest lead bytes for their respective
 * lengths.  In that case every byte is restored to its original value and
 * false is returned, as all character-increment functions must do on
 * failure.
 */
 static bool
 pg_utf8_increment(unsigned char *charptr, int length)
 {
 	unsigned char saved[4];
 	int			pos;
 	/* reject lengths 5 and 6 (and anything else unexpected) for now */
 	if (length < 1 || length > 4)
 		return false;
 	/* keep a pristine copy so a failed carry can be undone */
 	memcpy(saved, charptr, length);
 	for (pos = length - 1; pos > 0; pos--)
 	{
 		unsigned char ceiling = 0xBF;
 		if (pos == 1)
 		{
 			/* the second byte's range depends on the lead byte */
 			if (charptr[0] == 0xED)
 				ceiling = 0x9F;
 			else if (charptr[0] == 0xF4)
 				ceiling = 0x8F;
 		}
 		if (charptr[pos] < ceiling)
 		{
 			charptr[pos]++;
 			return true;
 		}
 		/* this byte is exhausted: wrap it and carry leftward */
 		charptr[pos] = 0x80;
 	}
 	switch (charptr[0])
 	{
 		case 0x7F:
 		case 0xDF:
 		case 0xEF:
 		case 0xF4:
 			/* Restore original string. */
 			memcpy(charptr, saved, length);
 			return false;
 		default:
 			break;
 	}
 	charptr[0]++;
 	return true;
 }
 /*
 * EUC-JP character increment function.
 *
 * The lead byte selects one of four cases:
 *
 * SS2 (JIS X 0201): the character must be two bytes long.  A second byte
 * below 0xa1 is raised to 0xa1; a second byte from 0xa1 through 0xde is
 * incremented; anything above 0xde causes the whole sequence to be
 * rewritten as 0xa1 0xa1.  This case never fails once the length is right.
 *
 * SS3 (JIS X 0212): the character must be three bytes long.  The last two
 * bytes are advanced like an odometer over the range 0xa1..0xfe, the final
 * byte first, carrying into the middle byte.  The lead byte is never
 * changed.
 *
 * Any other lead byte with its high bit set (JIS X 0208): the character
 * must be two bytes long, and both bytes are advanced in the same odometer
 * fashion.
 *
 * Otherwise the character is a single ASCII byte, which is incremented
 * unless it is already above 0x7e.
 *
 * A length that does not match the lead byte yields false with the input
 * untouched.  When an odometer carry runs off the most significant
 * adjustable byte, the altered bytes are restored and false is returned.
 * Consequently 0x7f, 0xfe 0xfe and 0x8f 0xfe 0xfe cannot be incremented.
 */
 static bool
 pg_eucjp_increment(unsigned char *charptr, int length)
 {
 	unsigned char lead = charptr[0];
 	unsigned char saved[3];
 	int			first_adjustable;
 	int			pos;
 	if (lead == SS2)
 	{
 		/* JIS X 0201 */
 		if (length != 2)
 			return false;
 		if (charptr[1] > 0xde)
 		{
 			charptr[1] = 0xa1;
 			charptr[0] = 0xa1;
 		}
 		else if (charptr[1] < 0xa1)
 			charptr[1] = 0xa1;
 		else
 			charptr[1]++;
 		return true;
 	}
 	if (lead == SS3)
 	{
 		/* JIS X 0212: lead byte stays fixed, bytes 1 and 2 may move */
 		if (length != 3)
 			return false;
 		first_adjustable = 1;
 	}
 	else if (IS_HIGHBIT_SET(lead))
 	{
 		/* JIS X 0208?  Both bytes may move. */
 		if (length != 2)
 			return false;
 		first_adjustable = 0;
 	}
 	else
 	{
 		/* ASCII, single byte */
 		if (lead > 0x7e)
 			return false;
 		charptr[0]++;
 		return true;
 	}
 	/* Odometer-style increment, least significant byte first. */
 	for (pos = length - 1; pos >= first_adjustable; pos--)
 	{
 		saved[pos] = charptr[pos];
 		if (saved[pos] < 0xa1)
 		{
 			/* below the valid range: jump up to its first value */
 			charptr[pos] = 0xa1;
 			return true;
 		}
 		if (saved[pos] < 0xfe)
 		{
 			charptr[pos]++;
 			return true;
 		}
 		/* byte exhausted: wrap around and carry into the next one */
 		charptr[pos] = 0xa1;
 	}
 	/* Carried out of the code region: put back every byte we wrapped. */
 	for (pos = first_adjustable; pos < length; pos++)
 		charptr[pos] = saved[pos];
 	return false;
 }
 #endif
 /*
 *-------------------------------------------------------------------
 * encoding info table
@@ -1458,6 +1696,25 @@ pg_database_encoding_max_length(void)
 	return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
 }
 /*
 * Return the character-increment function suited to the current database
 * encoding.
 *
 * UTF-8 and EUC-JP each have a dedicated incrementer; every other encoding
 * is served by the generic one.
 */
 mbcharacter_incrementer
 pg_database_encoding_character_incrementer(void)
 {
 	int			encoding = GetDatabaseEncoding();
 	if (encoding == PG_UTF8)
 		return pg_utf8_increment;
 	if (encoding == PG_EUC_JP)
 		return pg_eucjp_increment;
 	return pg_generic_charinc;
 }
 /*
 * Verify mbstr to make sure that it is validly encoded in the current
 * database encoding.  Otherwise same as pg_verify_mbstr().
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -284,6 +284,8 @@ typedef int (*mblen_converter) (const unsigned char *mbstr);
 typedef int (*mbdisplaylen_converter) (const unsigned char *mbstr);
 typedef bool (*mbcharacter_incrementer) (unsigned char *mbstr, int len);
 typedef int (*mbverifier) (const unsigned char *mbstr, int len);
 typedef struct
@@ -389,6 +391,7 @@ extern int pg_encoding_mbcliplen(int encoding, const char *mbstr,
 extern int	pg_mbcharcliplen(const char *mbstr, int len, int imit);
 extern int	pg_encoding_max_length(int encoding);
 extern int	pg_database_encoding_max_length(void);
 extern mbcharacter_incrementer pg_database_encoding_character_incrementer(void);
 extern int	PrepareClientEncoding(int encoding);
 extern int	SetClientEncoding(int encoding);