diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 107d85fa8b8..f5660b2c3cd 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -5701,13 +5701,23 @@ byte_increment(unsigned char *ptr, int len) * and "9" is seen as largest by the collation, and append that to the given * prefix before trying to find a string that compares as larger. * - * If we max out the righthand byte, truncate off the last character - * and start incrementing the next. For example, if "z" were the last - * character in the sort order, then we could produce "foo" as a - * string greater than "fonz". + * To search for a greater string, we repeatedly "increment" the rightmost + * character, using an encoding-specific character incrementer function. + * When it's no longer possible to increment the last character, we truncate + * off that character and start incrementing the next-to-rightmost. + * For example, if "z" were the last character in the sort order, then we + * could produce "foo" as a string greater than "fonz". * * This could be rather slow in the worst case, but in most cases we * won't have to try more than one or two strings before succeeding. + * + * Note that it's important for the character incrementer not to be too anal + * about producing every possible character code, since in some cases the only + * way to get a larger string is to increment a previous character position. + * So we don't want to spend too much time trying every possible character + * code at the last position. A good rule of thumb is to be sure that we + * don't try more than 256*K values for a K-byte character (and definitely + * not 256^K, which is what an exhaustive search would approach). */ Const * make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) @@ -5779,17 +5789,19 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) } } + /* Select appropriate character-incrementer function */ if (datatype == BYTEAOID) - charinc = &byte_increment; + charinc = byte_increment; else charinc = pg_database_encoding_character_incrementer(); + /* And search ... */ while (len > 0) { - int charlen; + int charlen; unsigned char *lastchar; - Const *workstr_const; + /* Identify the last character --- for bytea, just the last byte */ if (datatype == BYTEAOID) charlen = 1; else @@ -5799,9 +5811,15 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) /* * Try to generate a larger string by incrementing the last character * (for BYTEA, we treat each byte as a character). + * + * Note: the incrementer function is expected to return true if it's + * generated a valid-per-the-encoding new character, otherwise false. + * The contents of the character on false return are unspecified. */ - if (charinc(lastchar, charlen)) + while (charinc(lastchar, charlen)) { + Const *workstr_const; + if (datatype == BYTEAOID) workstr_const = string_to_bytea_const(workstr, len); else @@ -5825,7 +5843,8 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) } /* - * Truncate off the last character or byte. + * No luck here, so truncate off the last character and try to + * increment the next one. */ len -= charlen; workstr[len] = '\0'; diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 39f6efc2412..10c46654e0a 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1337,85 +1337,78 @@ pg_utf8_islegal(const unsigned char *source, int length) #ifndef FRONTEND /* - * Generic character increment function. + * Generic character incrementer function. * * Not knowing anything about the properties of the encoding in use, we just - * keep incrementing the last byte until pg_verifymbstr() likes the result, - * or we run out of values to try. - * - * Like all character-increment functions, we must restore the original input - * string on failure. + * keep incrementing the last byte until we get a validly-encoded result, + * or we run out of values to try. We don't bother to try incrementing + * higher-order bytes, so there's no growth in runtime for wider characters. + * (If we did try to do that, we'd need to consider the likelihood that 255 + * is not a valid final byte in the encoding.) */ static bool pg_generic_charinc(unsigned char *charptr, int len) { - unsigned char *lastchar = (unsigned char *) (charptr + len - 1); - unsigned char savelastchar = *lastchar; - const char *const_charptr = (const char *)charptr; - - while (*lastchar < (unsigned char) 255) - { - (*lastchar)++; - if (!pg_verifymbstr(const_charptr, len, true)) - continue; - return true; - } - - *lastchar = savelastchar; - return false; + unsigned char *lastbyte = charptr + len - 1; + mbverifier mbverify; + + /* We can just invoke the character verifier directly. */ + mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify; + + while (*lastbyte < (unsigned char) 255) + { + (*lastbyte)++; + if ((*mbverify) (charptr, len) == len) + return true; + } + + return false; } /* - * UTF-8 character increment function. + * UTF-8 character incrementer function. * * For a one-byte character less than 0x7F, we just increment the byte. * * For a multibyte character, every byte but the first must fall between 0x80 * and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment - * the last byte that's not already at its maximum value, and set any following - * bytes back to 0x80. If we can't find a byte that's less than the maximum - * allowable vale, we simply fail. We also have some special-case logic to - * skip regions used for surrogate pair handling, as those should not occur in - * valid UTF-8. + * the last byte that's not already at its maximum value. If we can't find a + * byte that's less than the maximum allowable value, we simply fail. We also + * need some special-case logic to skip regions used for surrogate pair + * handling, as those should not occur in valid UTF-8. * - * Like all character-increment functions, we must restore the original input - * string on failure. + * Note that we don't reset lower-order bytes back to their minimums, since + * we can't afford to make an exhaustive search (see make_greater_string). */ static bool pg_utf8_increment(unsigned char *charptr, int length) { - unsigned char a; - unsigned char bak[4]; + unsigned char a; unsigned char limit; - switch (length) - { - default: - /* reject lengths 5 and 6 for now */ - return false; - case 4: - bak[3] = charptr[3]; - a = charptr[3]; - if (a < 0xBF) - { - charptr[3]++; - break; - } - charptr[3] = 0x80; - /* FALL THRU */ - case 3: - bak[2] = charptr[2]; - a = charptr[2]; - if (a < 0xBF) - { - charptr[2]++; - break; - } - charptr[2] = 0x80; - /* FALL THRU */ - case 2: - bak[1] = charptr[1]; - a = charptr[1]; + switch (length) + { + default: + /* reject lengths 5 and 6 for now */ + return false; + case 4: + a = charptr[3]; + if (a < 0xBF) + { + charptr[3]++; + break; + } + /* FALL THRU */ + case 3: + a = charptr[2]; + if (a < 0xBF) + { + charptr[2]++; + break; + } + /* FALL THRU */ + case 2: + a = charptr[1]; switch (*charptr) { case 0xED: @@ -1430,147 +1423,126 @@ pg_utf8_increment(unsigned char *charptr, int length) } if (a < limit) { - charptr[1]++; - break; - } - charptr[1] = 0x80; - /* FALL THRU */ - case 1: - bak[0] = *charptr; - a = *charptr; - if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4) - { - /* Restore original string. */ - memcpy(charptr, bak, length); - return false; - } - charptr[0]++; - break; - } + charptr[1]++; + break; + } + /* FALL THRU */ + case 1: + a = *charptr; + if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4) + return false; + charptr[0]++; + break; + } - return true; + return true; } /* - * EUC-JP character increment function. + * EUC-JP character incrementer function. * - * If the sequence starts with SS2(0x8e), it must be a two-byte sequence - * representing JIS X 0201 characters with the second byte ranges between - * 0xa1 and 0xde. We just increment the last byte if it's less than 0xde, - * and otherwise rewrite whole the sequence to 0xa1 0xa1. + * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence + * representing JIS X 0201 characters with the second byte ranging between + * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf, + * and otherwise rewrite the whole sequence to 0xa1 0xa1. * - * If the sequence starts with SS3(0x8f), it must be a three-byte sequence - * which the last two bytes ranges between 0xa1 and 0xfe. The last byte - * is incremented, carrying overflow to the second-to-last byte. + * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence + * in which the last two bytes range between 0xa1 and 0xfe. The last byte + * is incremented if possible, otherwise the second-to-last byte. * - * If the sequence starts with the values other than the aboves and its MSB + * If the sequence starts with a value other than the above and its MSB * is set, it must be a two-byte sequence representing JIS X 0208 characters - * with both bytes ranges between 0xa1 and 0xfe. The last byte is incremented, - * carrying overflow to the second-to-last byte. + * with both bytes ranging between 0xa1 and 0xfe. The last byte is + * incremented if possible, otherwise the second-to-last byte. * - * Otherwise the sequence is consists of single byte representing ASCII - * characters. It is incremented up to 0x7f. - * - * Only three EUC-JP byte sequences shown below - which have no character - * allocated - make this function to fail in spite of its validity: 0x7f, - * 0xfe 0xfe, 0x8f 0xfe 0xfe. + * Otherwise, the sequence is a single-byte ASCII character. It is + * incremented up to 0x7f. */ static bool pg_eucjp_increment(unsigned char *charptr, int length) { - unsigned char bak[3]; - unsigned char c1, c2; - signed int i; + unsigned char c1, + c2; + int i; - c1 = *charptr; + c1 = *charptr; - switch (c1) - { - case SS2: /* JIS X 0201 */ - if (length != 2) + switch (c1) + { + case SS2: /* JIS X 0201 */ + if (length != 2) return false; - c2 = charptr[1]; + c2 = charptr[1]; - if (c2 > 0xde) - charptr[0] = charptr[1] = 0xa1; - else if (c2 < 0xa1) - charptr[1] = 0xa1; - else - charptr[1]++; + if (c2 >= 0xdf) + charptr[0] = charptr[1] = 0xa1; + else if (c2 < 0xa1) + charptr[1] = 0xa1; + else + charptr[1]++; + break; - break; - - case SS3: /* JIS X 0212 */ - if (length != 3) + case SS3: /* JIS X 0212 */ + if (length != 3) return false; - for (i = 2; i > 0; i--) - { - bak[i] = charptr[i]; - c2 = charptr[i]; - if (c2 < 0xa1) - { - charptr[i] = 0xa1; - return true; - } - else if (c2 < 0xfe) - { - charptr[i]++; - break; - } - charptr[i] = 0xa1; - } + for (i = 2; i > 0; i--) + { + c2 = charptr[i]; + if (c2 < 0xa1) + { + charptr[i] = 0xa1; + return true; + } + else if (c2 < 0xfe) + { + charptr[i]++; + return true; + } + } - if (i == 0) /* Out of 3-byte code region */ - { - charptr[1] = bak[1]; - charptr[2] = bak[2]; - return false; - } - break; + /* Out of 3-byte code region */ + return false; - default: - if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */ - { - if (length != 2) + default: + if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */ + { + if (length != 2) return false; - for (i = 1 ; i >= 0 ; i--) /* i must be signed */ - { - bak[i] = charptr[i]; - c2 = charptr[i]; - if (c2 < 0xa1) - { - charptr[i] = 0xa1; - return true; - } - else if (c2 < 0xfe) - { - charptr[i]++; - break; - } - charptr[i] = 0xa1; - } + for (i = 1; i >= 0; i--) + { + c2 = charptr[i]; + if (c2 < 0xa1) + { + charptr[i] = 0xa1; + return true; + } + else if (c2 < 0xfe) + { + charptr[i]++; + return true; + } + } - if (i < 0) /* Out of 2 byte code region */ - { - charptr[0] = bak[0]; - charptr[1] = bak[1]; - return false; - } - } - else - { /* ASCII, single byte */ - if (c1 > 0x7e) - return false; - (*charptr)++; - } - } + /* Out of 2 byte code region */ + return false; + } + else + { /* ASCII, single byte */ + if (c1 > 0x7e) + return false; + (*charptr)++; + } + break; + } - return true; + return true; } -#endif + +#endif /* !FRONTEND */ + /* *------------------------------------------------------------------- @@ -1697,19 +1669,23 @@ pg_database_encoding_max_length(void) } /* - * give the character incrementer for the encoding for the current database + * get the character incrementer for the encoding for the current database */ mbcharacter_incrementer pg_database_encoding_character_incrementer(void) { + /* + * Eventually it might be best to add a field to pg_wchar_table[], + * but for now we just use a switch. + */ switch (GetDatabaseEncoding()) { case PG_UTF8: return pg_utf8_increment; - + case PG_EUC_JP: return pg_eucjp_increment; - + default: return pg_generic_charinc; } @@ -1908,4 +1884,4 @@ report_untranslatable_char(int src_encoding, int dest_encoding, pg_enc2name_tbl[dest_encoding].name))); } -#endif +#endif /* !FRONTEND */