diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index f70cfe75e9e..7c3ef92cd2e 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -1140,8 +1140,7 @@ SELECT '52093.89'::money::numeric::float8; advantages in some other database systems, there is no such advantage in PostgreSQL; in fact character(n) is usually the slowest of - the three because of its additional storage costs and slower - sorting. In most situations + the three because of its additional storage costs. In most situations text or character varying should be used instead. diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c index 0498fef4049..94d6da5eb57 100644 --- a/src/backend/utils/adt/varchar.c +++ b/src/backend/utils/adt/varchar.c @@ -17,6 +17,7 @@ #include "access/hash.h" #include "access/tuptoaster.h" +#include "catalog/pg_collation.h" #include "libpq/pqformat.h" #include "nodes/nodeFuncs.h" #include "utils/array.h" @@ -649,14 +650,21 @@ varchartypmodout(PG_FUNCTION_ARGS) *****************************************************************************/ /* "True" length (not counting trailing blanks) of a BpChar */ -static int +static inline int bcTruelen(BpChar *arg) { - char *s = VARDATA_ANY(arg); - int i; - int len; + return bpchartruelen(VARDATA_ANY(arg), VARSIZE_ANY_EXHDR(arg)); +} - len = VARSIZE_ANY_EXHDR(arg); +int +bpchartruelen(char *s, int len) +{ + int i; + + /* + * Note that we rely on the assumption that ' ' is a singleton unit on + * every supported multibyte server encoding. 
+ */ for (i = len - 1; i >= 0; i--) { if (s[i] != ' ') @@ -858,6 +866,23 @@ bpcharcmp(PG_FUNCTION_ARGS) PG_RETURN_INT32(cmp); } +Datum +bpchar_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + Oid collid = ssup->ssup_collation; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); + + /* Use generic string SortSupport */ + varstr_sortsupport(ssup, collid, true); + + MemoryContextSwitchTo(oldcontext); + + PG_RETURN_VOID(); +} + Datum bpchar_larger(PG_FUNCTION_ARGS) { @@ -926,8 +951,9 @@ hashbpchar(PG_FUNCTION_ARGS) /* * The following operators support character-by-character comparison * of bpchar datums, to allow building indexes suitable for LIKE clauses. - * Note that the regular bpchareq/bpcharne comparison operators are assumed - * to be compatible with these! + * Note that the regular bpchareq/bpcharne comparison operators, and + * regular support functions 1 and 2 with "C" collation are assumed to be + * compatible with these! */ static int @@ -1030,3 +1056,20 @@ btbpchar_pattern_cmp(PG_FUNCTION_ARGS) PG_RETURN_INT32(result); } + + +Datum +btbpchar_pattern_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); + + /* Use generic string SortSupport, forcing "C" collation */ + varstr_sortsupport(ssup, C_COLLATION_OID, true); + + MemoryContextSwitchTo(oldcontext); + + PG_RETURN_VOID(); +} diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 8683188df13..1a74e5e93c8 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -40,6 +40,7 @@ int bytea_output = BYTEA_OUTPUT_HEX; typedef struct varlena unknown; +typedef struct varlena string; typedef struct { @@ -67,13 +68,14 @@ typedef struct int last_returned; /* Last comparison result (cache) */ bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? 
*/ bool collate_c; + bool bpchar; /* Sorting bpchar, not varchar/text/bytea? */ hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ hyperLogLogState full_card; /* Full key cardinality state */ double prop_card; /* Required cardinality proportion */ #ifdef HAVE_LOCALE_T pg_locale_t locale; #endif -} TextSortSupport; +} StringSortSupport; /* * This should be large enough that most strings will fit, but small enough @@ -87,12 +89,15 @@ typedef struct #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n)) #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x) -static void btsortsupport_worker(SortSupport ssup, Oid collid); -static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup); -static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup); -static int bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup); -static Datum bttext_abbrev_convert(Datum original, SortSupport ssup); -static bool bttext_abbrev_abort(int memtupcount, SortSupport ssup); +#define DatumGetStringP(X) ((string *) PG_DETOAST_DATUM(X)) +#define DatumGetStringPP(X) ((string *) PG_DETOAST_DATUM_PACKED(X)) + +static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup); +static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup); +static int varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup); +static int varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup); +static Datum varstr_abbrev_convert(Datum original, SortSupport ssup); +static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup); static int32 text_length(Datum str); static text *text_catenate(text *t1, text *t2); static text *text_substring(Datum str, @@ -1738,19 +1743,30 @@ bttextsortsupport(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - btsortsupport_worker(ssup, collid); + /* Use generic string SortSupport */ + varstr_sortsupport(ssup, collid, false); MemoryContextSwitchTo(oldcontext); PG_RETURN_VOID(); } -static void -btsortsupport_worker(SortSupport 
ssup, Oid collid) +/* + * Generic sortsupport interface for character type's operator classes. + * Includes locale support, and support for BpChar semantics (i.e. removing + * trailing spaces before comparison). + * + * Relies on the assumption that text, VarChar, BpChar, and bytea all have the + * same representation. Callers that always use the C collation (e.g. + * non-collatable type callers like bytea) may have NUL bytes in their strings; + * this will not work with any other collation, though. + */ +void +varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar) { bool abbreviate = ssup->abbreviate; bool collate_c = false; - TextSortSupport *tss; + StringSortSupport *sss; #ifdef HAVE_LOCALE_T pg_locale_t locale = 0; @@ -1762,20 +1778,25 @@ btsortsupport_worker(SortSupport ssup, Oid collid) * overhead of a trip through the fmgr layer for every comparison, which * can be substantial. * - * Most typically, we'll set the comparator to bttextfastcmp_locale, which - * uses strcoll() to perform comparisons. However, if LC_COLLATE = C, we - * can make things quite a bit faster with bttextfastcmp_c, which uses - * memcmp() rather than strcoll(). + * Most typically, we'll set the comparator to varstrfastcmp_locale, which + * uses strcoll() to perform comparisons and knows about the special + * requirements of BpChar callers. However, if LC_COLLATE = C, we can make + * things quite a bit faster with varstrfastcmp_c or bpcharfastcmp_c, + * both of which use memcmp() rather than strcoll(). * * There is a further exception on Windows. When the database encoding is * UTF-8 and we are not using the C collation, complex hacks are required. * We don't currently have a comparator that handles that case, so we fall - * back on the slow method of having the sort code invoke bttextcmp() via - * the fmgr trampoline. + * back on the slow method of having the sort code invoke bttextcmp() (in + * the case of text) via the fmgr trampoline. 
*/ if (lc_collate_is_c(collid)) { - ssup->comparator = bttextfastcmp_c; + if (!bpchar) + ssup->comparator = varstrfastcmp_c; + else + ssup->comparator = bpcharfastcmp_c; + collate_c = true; } #ifdef WIN32 @@ -1784,7 +1805,7 @@ btsortsupport_worker(SortSupport ssup, Oid collid) #endif else { - ssup->comparator = bttextfastcmp_locale; + ssup->comparator = varstrfastcmp_locale; /* * We need a collation-sensitive comparison. To make things faster, @@ -1825,24 +1846,25 @@ btsortsupport_worker(SortSupport ssup, Oid collid) /* * If we're using abbreviated keys, or if we're using a locale-aware - * comparison, we need to initialize a TextSortSupport object. Both cases - * will make use of the temporary buffers we initialize here for scratch - * space, and the abbreviation case requires additional state. + * comparison, we need to initialize a StringSortSupport object. Both + * cases will make use of the temporary buffers we initialize here for + * scratch space (and to detect requirement for BpChar semantics from + * caller), and the abbreviation case requires additional state. */ if (abbreviate || !collate_c) { - tss = palloc(sizeof(TextSortSupport)); - tss->buf1 = palloc(TEXTBUFLEN); - tss->buflen1 = TEXTBUFLEN; - tss->buf2 = palloc(TEXTBUFLEN); - tss->buflen2 = TEXTBUFLEN; + sss = palloc(sizeof(StringSortSupport)); + sss->buf1 = palloc(TEXTBUFLEN); + sss->buflen1 = TEXTBUFLEN; + sss->buf2 = palloc(TEXTBUFLEN); + sss->buflen2 = TEXTBUFLEN; /* Start with invalid values */ - tss->last_len1 = -1; - tss->last_len2 = -1; + sss->last_len1 = -1; + sss->last_len2 = -1; /* Initialize */ - tss->last_returned = 0; + sss->last_returned = 0; #ifdef HAVE_LOCALE_T - tss->locale = locale; + sss->locale = locale; #endif /* * To avoid somehow confusing a strxfrm() blob and an original string, @@ -1858,9 +1880,10 @@ btsortsupport_worker(SortSupport ssup, Oid collid) * * Arbitrarily initialize cache_blob to true. 
*/ - tss->cache_blob = true; - tss->collate_c = collate_c; - ssup->ssup_extra = tss; + sss->cache_blob = true; + sss->collate_c = collate_c; + sss->bpchar = bpchar; + ssup->ssup_extra = sss; /* * If possible, plan to use the abbreviated keys optimization. The @@ -1869,13 +1892,13 @@ btsortsupport_worker(SortSupport ssup, Oid collid) */ if (abbreviate) { - tss->prop_card = 0.20; - initHyperLogLog(&tss->abbr_card, 10); - initHyperLogLog(&tss->full_card, 10); + sss->prop_card = 0.20; + initHyperLogLog(&sss->abbr_card, 10); + initHyperLogLog(&sss->full_card, 10); ssup->abbrev_full_comparator = ssup->comparator; - ssup->comparator = bttextcmp_abbrev; - ssup->abbrev_converter = bttext_abbrev_convert; - ssup->abbrev_abort = bttext_abbrev_abort; + ssup->comparator = varstrcmp_abbrev; + ssup->abbrev_converter = varstr_abbrev_convert; + ssup->abbrev_abort = varstr_abbrev_abort; } } } @@ -1884,10 +1907,10 @@ btsortsupport_worker(SortSupport ssup, Oid collid) * sortsupport comparison func (for C locale case) */ static int -bttextfastcmp_c(Datum x, Datum y, SortSupport ssup) +varstrfastcmp_c(Datum x, Datum y, SortSupport ssup) { - text *arg1 = DatumGetTextPP(x); - text *arg2 = DatumGetTextPP(y); + string *arg1 = DatumGetStringPP(x); + string *arg2 = DatumGetStringPP(y); char *a1p, *a2p; int len1, @@ -1913,16 +1936,53 @@ bttextfastcmp_c(Datum x, Datum y, SortSupport ssup) return result; } +/* + * sortsupport comparison func (for BpChar C locale case) + * + * BpChar outsources its sortsupport to this module. Specialization for the + * varstr_sortsupport BpChar case, modeled on + * internal_bpchar_pattern_compare(). 
+ */ +static int +bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup) +{ + BpChar *arg1 = DatumGetBpCharPP(x); + BpChar *arg2 = DatumGetBpCharPP(y); + char *a1p, + *a2p; + int len1, + len2, + result; + + a1p = VARDATA_ANY(arg1); + a2p = VARDATA_ANY(arg2); + + len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1)); + len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2)); + + result = memcmp(a1p, a2p, Min(len1, len2)); + if ((result == 0) && (len1 != len2)) + result = (len1 < len2) ? -1 : 1; + + /* We can't afford to leak memory here. */ + if (PointerGetDatum(arg1) != x) + pfree(arg1); + if (PointerGetDatum(arg2) != y) + pfree(arg2); + + return result; +} + /* * sortsupport comparison func (for locale case) */ static int -bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup) +varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup) { - text *arg1 = DatumGetTextPP(x); - text *arg2 = DatumGetTextPP(y); + string *arg1 = DatumGetStringPP(x); + string *arg2 = DatumGetStringPP(y); bool arg1_match; - TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra; + StringSortSupport *sss = (StringSortSupport *) ssup->ssup_extra; /* working state */ char *a1p, @@ -1944,41 +2004,56 @@ bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup) * No change in buf1 or buf2 contents, so avoid changing last_len1 or * last_len2. Existing contents of buffers might still be used by next * call. + * + * It's fine to allow the comparison of BpChar padding bytes here, even + * though that implies that the memcmp() will usually be performed for + * BpChar callers (though multibyte characters could still prevent that + * from occurring). The memcmp() is still very cheap, and BpChar's + * funny semantics have us remove trailing spaces (not limited to + * padding), so we need make no distinction between padding space + * characters and "real" space characters. 
*/ result = 0; goto done; } - if (len1 >= tss->buflen1) + if (sss->bpchar) { - pfree(tss->buf1); - tss->buflen1 = Max(len1 + 1, Min(tss->buflen1 * 2, MaxAllocSize)); - tss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen1); + /* Get true number of bytes, ignoring trailing spaces */ + len1 = bpchartruelen(a1p, len1); + len2 = bpchartruelen(a2p, len2); } - if (len2 >= tss->buflen2) + + if (len1 >= sss->buflen1) { - pfree(tss->buf2); - tss->buflen2 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize)); - tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2); + pfree(sss->buf1); + sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize)); + sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1); + } + if (len2 >= sss->buflen2) + { + pfree(sss->buf2); + sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize)); + sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2); } /* * We're likely to be asked to compare the same strings repeatedly, and * memcmp() is so much cheaper than strcoll() that it pays to try to cache * comparisons, even though in general there is no reason to think that - * that will work out (every text datum may be unique). Caching does not + * that will work out (every string datum may be unique). Caching does not * slow things down measurably when it doesn't work out, and can speed * things up by rather a lot when it does. In part, this is because the * memcmp() compares data from cachelines that are needed in L1 cache even * when the last comparison's result cannot be reused. 
*/ arg1_match = true; - if (len1 != tss->last_len1 || memcmp(tss->buf1, a1p, len1) != 0) + if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0) { arg1_match = false; - memcpy(tss->buf1, a1p, len1); - tss->buf1[len1] = '\0'; - tss->last_len1 = len1; + memcpy(sss->buf1, a1p, len1); + sss->buf1[len1] = '\0'; + sss->last_len1 = len1; } /* @@ -1987,25 +2062,25 @@ bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup) * it seems (at least with moderate to low cardinality sets), because * quicksort compares the same pivot against many values. */ - if (len2 != tss->last_len2 || memcmp(tss->buf2, a2p, len2) != 0) + if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0) { - memcpy(tss->buf2, a2p, len2); - tss->buf2[len2] = '\0'; - tss->last_len2 = len2; + memcpy(sss->buf2, a2p, len2); + sss->buf2[len2] = '\0'; + sss->last_len2 = len2; } - else if (arg1_match && !tss->cache_blob) + else if (arg1_match && !sss->cache_blob) { /* Use result cached following last actual strcoll() call */ - result = tss->last_returned; + result = sss->last_returned; goto done; } #ifdef HAVE_LOCALE_T - if (tss->locale) - result = strcoll_l(tss->buf1, tss->buf2, tss->locale); + if (sss->locale) + result = strcoll_l(sss->buf1, sss->buf2, sss->locale); else #endif - result = strcoll(tss->buf1, tss->buf2); + result = strcoll(sss->buf1, sss->buf2); /* * In some locales strcoll() can claim that nonidentical strings are @@ -2013,11 +2088,11 @@ bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup) * follow Perl's lead and sort "equal" strings according to strcmp(). */ if (result == 0) - result = strcmp(tss->buf1, tss->buf2); + result = strcmp(sss->buf1, sss->buf2); /* Cache result, perhaps saving an expensive strcoll() call next time */ - tss->cache_blob = false; - tss->last_returned = result; + sss->cache_blob = false; + sss->last_returned = result; done: /* We can't afford to leak memory here. 
*/ if (PointerGetDatum(arg1) != x) @@ -2032,13 +2107,14 @@ done: * Abbreviated key comparison func */ static int -bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup) +varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup) { /* - * When 0 is returned, the core system will call bttextfastcmp_c() or - * bttextfastcmp_locale(). Even a strcmp() on two non-truncated strxfrm() - * blobs cannot indicate *equality* authoritatively, for the same reason - * that there is a strcoll() tie-breaker call to strcmp() in varstr_cmp(). + * When 0 is returned, the core system will call varstrfastcmp_c() + * (bpcharfastcmp_c() in BpChar case) or varstrfastcmp_locale(). Even a + * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality* + * authoritatively, for the same reason that there is a strcoll() + * tie-breaker call to strcmp() in varstr_cmp(). */ if (x > y) return 1; @@ -2049,16 +2125,17 @@ bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup) } /* - * Conversion routine for sortsupport. Converts original text to abbreviated - * key representation. Our encoding strategy is simple -- pack the first 8 - * bytes of a strxfrm() blob into a Datum (on little-endian machines, the 8 - * bytes are stored in reverse order), and treat it as an unsigned integer. + * Conversion routine for sortsupport. Converts original to abbreviated key + * representation. Our encoding strategy is simple -- pack the first 8 bytes + * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are + * stored in reverse order), and treat it as an unsigned integer. When the "C" + * locale is used, or in case of bytea, just memcpy() from original instead. 
*/ static Datum -bttext_abbrev_convert(Datum original, SortSupport ssup) +varstr_abbrev_convert(Datum original, SortSupport ssup) { - TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra; - text *authoritative = DatumGetTextPP(original); + StringSortSupport *sss = (StringSortSupport *) ssup->ssup_extra; + string *authoritative = DatumGetStringPP(original); char *authoritative_data = VARDATA_ANY(authoritative); /* working state */ @@ -2072,13 +2149,38 @@ bttext_abbrev_convert(Datum original, SortSupport ssup) memset(pres, 0, sizeof(Datum)); len = VARSIZE_ANY_EXHDR(authoritative); + /* Get number of bytes, ignoring trailing spaces */ + if (sss->bpchar) + len = bpchartruelen(authoritative_data, len); + /* * If we're using the C collation, use memcmp(), rather than strxfrm(), to * abbreviate keys. The full comparator for the C locale is always - * memcmp(), and we can't risk having this give a different answer. - * Besides, this should be faster, too. + * memcmp(). It would be incorrect to allow bytea callers (callers that + * always force the C collation -- bytea isn't a collatable type, but this + * approach is convenient) to use strxfrm(). This is because bytea strings + * may contain NUL bytes. Besides, this should be faster, too. + * + * More generally, it's okay that bytea callers can have NUL bytes in + * strings because varstrcmp_abbrev() need not make a distinction between + * terminating NUL bytes, and NUL bytes representing actual NULs in the + * authoritative representation. Hopefully a comparison at or past one + * abbreviated key's terminating NUL byte will resolve the comparison + * without consulting the authoritative representation; specifically, some + * later non-NUL byte in the longer string can resolve the comparison + * against a subsequent terminating NUL in the shorter string. There will + * usually be what is effectively a "length-wise" resolution there and + * then. 
+ * + * If that doesn't work out -- if all bytes in the longer string positioned + * at or past the offset of the smaller string's (first) terminating NUL + * are actually representative of NUL bytes in the authoritative binary + * string (perhaps with some *terminating* NUL bytes towards the end of the + * longer string iff it happens to still be small) -- then an authoritative + * tie-breaker will happen, and do the right thing: explicitly consider + * string length. */ - if (tss->collate_c) + if (sss->collate_c) memcpy(pres, authoritative_data, Min(len, sizeof(Datum))); else { @@ -2088,50 +2190,50 @@ bttext_abbrev_convert(Datum original, SortSupport ssup) * We're not using the C collation, so fall back on strxfrm. */ - /* By convention, we use buffer 1 to store and NUL-terminate text */ - if (len >= tss->buflen1) + /* By convention, we use buffer 1 to store and NUL-terminate */ + if (len >= sss->buflen1) { - pfree(tss->buf1); - tss->buflen1 = Max(len + 1, Min(tss->buflen1 * 2, MaxAllocSize)); - tss->buf1 = palloc(tss->buflen1); + pfree(sss->buf1); + sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize)); + sss->buf1 = palloc(sss->buflen1); } /* Might be able to reuse strxfrm() blob from last call */ - if (tss->last_len1 == len && tss->cache_blob && - memcmp(tss->buf1, authoritative_data, len) == 0) + if (sss->last_len1 == len && sss->cache_blob && + memcmp(sss->buf1, authoritative_data, len) == 0) { - memcpy(pres, tss->buf2, Min(sizeof(Datum), tss->last_len2)); + memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2)); /* No change affecting cardinality, so no hashing required */ goto done; } /* Just like strcoll(), strxfrm() expects a NUL-terminated string */ - memcpy(tss->buf1, authoritative_data, len); - tss->buf1[len] = '\0'; - tss->last_len1 = len; + memcpy(sss->buf1, authoritative_data, len); + sss->buf1[len] = '\0'; + sss->last_len1 = len; for (;;) { #ifdef HAVE_LOCALE_T - if (tss->locale) - bsize = strxfrm_l(tss->buf2, tss->buf1, - 
tss->buflen2, tss->locale); + if (sss->locale) + bsize = strxfrm_l(sss->buf2, sss->buf1, + sss->buflen2, sss->locale); else #endif - bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2); + bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2); - tss->last_len2 = bsize; - if (bsize < tss->buflen2) + sss->last_len2 = bsize; + if (bsize < sss->buflen2) break; /* * The C standard states that the contents of the buffer is now * unspecified. Grow buffer, and retry. */ - pfree(tss->buf2); - tss->buflen2 = Max(bsize + 1, - Min(tss->buflen2 * 2, MaxAllocSize)); - tss->buf2 = palloc(tss->buflen2); + pfree(sss->buf2); + sss->buflen2 = Max(bsize + 1, + Min(sss->buflen2 * 2, MaxAllocSize)); + sss->buf2 = palloc(sss->buflen2); } /* @@ -2139,8 +2241,11 @@ bttext_abbrev_convert(Datum original, SortSupport ssup) * strxfrm() blob is itself NUL terminated, leaving no danger of * misinterpreting any NUL bytes not intended to be interpreted as * logically representing termination. + * + * (Actually, even if there were NUL bytes in the blob it would be + * okay. See remarks on bytea case above.) */ - memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize)); + memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize)); } /* @@ -2148,7 +2253,7 @@ bttext_abbrev_convert(Datum original, SortSupport ssup) * authoritative keys using HyperLogLog. Used as cheap insurance against * the worst case, where we do many string transformations for no saving * in full strcoll()-based comparisons. These statistics are used by - * bttext_abbrev_abort(). + * varstr_abbrev_abort(). * * First, Hash key proper, or a significant fraction of it. 
Mix in length * in order to compensate for cases where differences are past @@ -2160,7 +2265,7 @@ bttext_abbrev_convert(Datum original, SortSupport ssup) if (len > PG_CACHE_LINE_SIZE) hash ^= DatumGetUInt32(hash_uint32((uint32) len)); - addHyperLogLog(&tss->full_card, hash); + addHyperLogLog(&sss->full_card, hash); /* Hash abbreviated key */ #if SIZEOF_DATUM == 8 @@ -2176,15 +2281,15 @@ bttext_abbrev_convert(Datum original, SortSupport ssup) hash = DatumGetUInt32(hash_uint32((uint32) res)); #endif - addHyperLogLog(&tss->abbr_card, hash); + addHyperLogLog(&sss->abbr_card, hash); /* Cache result, perhaps saving an expensive strxfrm() call next time */ - tss->cache_blob = true; + sss->cache_blob = true; done: /* * Byteswap on little-endian machines. * - * This is needed so that bttextcmp_abbrev() (an unsigned integer 3-way + * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way * comparator) works correctly on all platforms. If we didn't do this, * the comparator would have to call memcmp() with a pair of pointers to * the first byte of each abbreviated key, which is slower. @@ -2204,9 +2309,9 @@ done: * should be aborted, based on its projected effectiveness. */ static bool -bttext_abbrev_abort(int memtupcount, SortSupport ssup) +varstr_abbrev_abort(int memtupcount, SortSupport ssup) { - TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra; + StringSortSupport *sss = (StringSortSupport *) ssup->ssup_extra; double abbrev_distinct, key_distinct; @@ -2216,8 +2321,8 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) if (memtupcount < 100) return false; - abbrev_distinct = estimateHyperLogLog(&tss->abbr_card); - key_distinct = estimateHyperLogLog(&tss->full_card); + abbrev_distinct = estimateHyperLogLog(&sss->abbr_card); + key_distinct = estimateHyperLogLog(&sss->full_card); /* * Clamp cardinality estimates to at least one distinct value. 
While @@ -2240,10 +2345,10 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) { double norm_abbrev_card = abbrev_distinct / (double) memtupcount; - elog(LOG, "bttext_abbrev: abbrev_distinct after %d: %f " + elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f " "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)", memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card, - tss->prop_card); + sss->prop_card); } #endif @@ -2263,7 +2368,7 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) * abbreviated comparison with a cheap memcmp()-based authoritative * resolution are equivalent. */ - if (abbrev_distinct > key_distinct * tss->prop_card) + if (abbrev_distinct > key_distinct * sss->prop_card) { /* * When we have exceeded 10,000 tuples, decay required cardinality @@ -2291,7 +2396,7 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) * apparent it's probably not worth aborting. */ if (memtupcount > 10000) - tss->prop_card *= 0.65; + sss->prop_card *= 0.65; return false; } @@ -2309,9 +2414,9 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) */ #ifdef TRACE_SORT if (trace_sort) - elog(LOG, "bttext_abbrev: aborted abbreviation at %d " + elog(LOG, "varstr_abbrev: aborted abbreviation at %d " "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)", - memtupcount, abbrev_distinct, key_distinct, tss->prop_card); + memtupcount, abbrev_distinct, key_distinct, sss->prop_card); #endif return true; @@ -2345,8 +2450,9 @@ text_smaller(PG_FUNCTION_ARGS) /* * The following operators support character-by-character comparison * of text datums, to allow building indexes suitable for LIKE clauses. - * Note that the regular texteq/textne comparison operators are assumed - * to be compatible with these! + * Note that the regular texteq/textne comparison operators, and regular + * support functions 1 and 2 with "C" collation are assumed to be + * compatible with these! 
*/ static int @@ -2451,6 +2557,23 @@ bttext_pattern_cmp(PG_FUNCTION_ARGS) } +Datum +bttext_pattern_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); + + /* Use generic string SortSupport, forcing "C" collation */ + varstr_sortsupport(ssup, C_COLLATION_OID, false); + + MemoryContextSwitchTo(oldcontext); + + PG_RETURN_VOID(); +} + + /*------------------------------------------------------------- * byteaoctetlen * @@ -3375,6 +3498,22 @@ byteacmp(PG_FUNCTION_ARGS) PG_RETURN_INT32(cmp); } +Datum +bytea_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); + + /* Use generic string SortSupport, forcing "C" collation */ + varstr_sortsupport(ssup, C_COLLATION_OID, false); + + MemoryContextSwitchTo(oldcontext); + + PG_RETURN_VOID(); +} + /* * appendStringInfoText * diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 5c480b7d3ab..568c98f94e4 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201601281 +#define CATALOG_VERSION_NO 201602031 #endif diff --git a/src/include/catalog/pg_amproc.h b/src/include/catalog/pg_amproc.h index e75da76b993..f0ae0087048 100644 --- a/src/include/catalog/pg_amproc.h +++ b/src/include/catalog/pg_amproc.h @@ -80,7 +80,9 @@ DATA(insert ( 421 702 702 1 357 )); DATA(insert ( 423 1560 1560 1 1596 )); DATA(insert ( 424 16 16 1 1693 )); DATA(insert ( 426 1042 1042 1 1078 )); +DATA(insert ( 426 1042 1042 2 3328 )); DATA(insert ( 428 17 17 1 1954 )); +DATA(insert ( 428 17 17 2 3331 )); DATA(insert ( 429 18 18 1 358 )); DATA(insert ( 434 1082 1082 1 1092 )); DATA(insert ( 434 1082 1082 2 3136 )); @@ -128,7 +130,9 @@ DATA(insert ( 1996 1083 1083 1 1107 )); DATA(insert ( 2000 1266 1266 
1 1358 )); DATA(insert ( 2002 1562 1562 1 1672 )); DATA(insert ( 2095 25 25 1 2166 )); +DATA(insert ( 2095 25 25 2 3332 )); DATA(insert ( 2097 1042 1042 1 2180 )); +DATA(insert ( 2097 1042 1042 2 3333 )); DATA(insert ( 2099 790 790 1 377 )); DATA(insert ( 2233 703 703 1 380 )); DATA(insert ( 2234 704 704 1 381 )); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index a2248b4679f..5ded13e2b01 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -1130,6 +1130,8 @@ DATA(insert OID = 1064 ( bpchar_smaller PGNSP PGUID 12 1 0 0 0 f f f f t f i DESCR("smaller of two"); DATA(insert OID = 1078 ( bpcharcmp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 23 "1042 1042" _null_ _null_ _null_ _null_ _null_ bpcharcmp _null_ _null_ _null_ )); DESCR("less-equal-greater"); +DATA(insert OID = 3328 ( bpchar_sortsupport PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ bpchar_sortsupport _null_ _null_ _null_ )); +DESCR("sort support"); DATA(insert OID = 1080 ( hashbpchar PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "1042" _null_ _null_ _null_ _null_ _null_ hashbpchar _null_ _null_ _null_ )); DESCR("hash"); DATA(insert OID = 1081 ( format_type PGNSP PGUID 12 1 0 0 0 f f f f f f s s 2 0 25 "26 23" _null_ _null_ _null_ _null_ _null_ format_type _null_ _null_ _null_ )); @@ -2861,6 +2863,8 @@ DATA(insert OID = 1952 ( byteage PGNSP PGUID 12 1 0 0 0 f f f t t f i s 2 0 DATA(insert OID = 1953 ( byteane PGNSP PGUID 12 1 0 0 0 f f f t t f i s 2 0 16 "17 17" _null_ _null_ _null_ _null_ _null_ byteane _null_ _null_ _null_ )); DATA(insert OID = 1954 ( byteacmp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 23 "17 17" _null_ _null_ _null_ _null_ _null_ byteacmp _null_ _null_ _null_ )); DESCR("less-equal-greater"); +DATA(insert OID = 3331 ( bytea_sortsupport PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ bytea_sortsupport _null_ _null_ _null_ )); +DESCR("sort support"); 
DATA(insert OID = 3917 ( timestamp_transform PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2281 "2281" _null_ _null_ _null_ _null_ _null_ timestamp_transform _null_ _null_ _null_ )); DESCR("transform a timestamp length coercion"); @@ -3373,6 +3377,8 @@ DATA(insert OID = 2163 ( text_pattern_ge PGNSP PGUID 12 1 0 0 0 f f f f t f i s DATA(insert OID = 2164 ( text_pattern_gt PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "25 25" _null_ _null_ _null_ _null_ _null_ text_pattern_gt _null_ _null_ _null_ )); DATA(insert OID = 2166 ( bttext_pattern_cmp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 23 "25 25" _null_ _null_ _null_ _null_ _null_ bttext_pattern_cmp _null_ _null_ _null_ )); DESCR("less-equal-greater"); +DATA(insert OID = 3332 ( bttext_pattern_sortsupport PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ bttext_pattern_sortsupport _null_ _null_ _null_ )); +DESCR("sort support"); DATA(insert OID = 2174 ( bpchar_pattern_lt PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "1042 1042" _null_ _null_ _null_ _null_ _null_ bpchar_pattern_lt _null_ _null_ _null_ )); DATA(insert OID = 2175 ( bpchar_pattern_le PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "1042 1042" _null_ _null_ _null_ _null_ _null_ bpchar_pattern_le _null_ _null_ _null_ )); @@ -3380,6 +3386,8 @@ DATA(insert OID = 2177 ( bpchar_pattern_ge PGNSP PGUID 12 1 0 0 0 f f f f t f DATA(insert OID = 2178 ( bpchar_pattern_gt PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "1042 1042" _null_ _null_ _null_ _null_ _null_ bpchar_pattern_gt _null_ _null_ _null_ )); DATA(insert OID = 2180 ( btbpchar_pattern_cmp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 23 "1042 1042" _null_ _null_ _null_ _null_ _null_ btbpchar_pattern_cmp _null_ _null_ _null_ )); DESCR("less-equal-greater"); +DATA(insert OID = 3333 ( btbpchar_pattern_sortsupport PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ btbpchar_pattern_sortsupport _null_ _null_ _null_ )); +DESCR("sort support"); 
DATA(insert OID = 2188 ( btint48cmp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 23 "23 20" _null_ _null_ _null_ _null_ _null_ btint48cmp _null_ _null_ _null_ )); DESCR("less-equal-greater"); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index c2e529fc6f6..5e8e8329b82 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -16,6 +16,7 @@ #include "fmgr.h" #include "nodes/parsenodes.h" +#include "utils/sortsupport.h" /* * Defined in adt/ @@ -761,8 +762,10 @@ extern Datum bpcharle(PG_FUNCTION_ARGS); extern Datum bpchargt(PG_FUNCTION_ARGS); extern Datum bpcharge(PG_FUNCTION_ARGS); extern Datum bpcharcmp(PG_FUNCTION_ARGS); +extern Datum bpchar_sortsupport(PG_FUNCTION_ARGS); extern Datum bpchar_larger(PG_FUNCTION_ARGS); extern Datum bpchar_smaller(PG_FUNCTION_ARGS); +extern int bpchartruelen(char *s, int len); extern Datum bpcharlen(PG_FUNCTION_ARGS); extern Datum bpcharoctetlen(PG_FUNCTION_ARGS); extern Datum hashbpchar(PG_FUNCTION_ARGS); @@ -771,6 +774,7 @@ extern Datum bpchar_pattern_le(PG_FUNCTION_ARGS); extern Datum bpchar_pattern_gt(PG_FUNCTION_ARGS); extern Datum bpchar_pattern_ge(PG_FUNCTION_ARGS); extern Datum btbpchar_pattern_cmp(PG_FUNCTION_ARGS); +extern Datum btbpchar_pattern_sortsupport(PG_FUNCTION_ARGS); extern Datum varcharin(PG_FUNCTION_ARGS); extern Datum varcharout(PG_FUNCTION_ARGS); @@ -808,6 +812,7 @@ extern Datum text_pattern_le(PG_FUNCTION_ARGS); extern Datum text_pattern_gt(PG_FUNCTION_ARGS); extern Datum text_pattern_ge(PG_FUNCTION_ARGS); extern Datum bttext_pattern_cmp(PG_FUNCTION_ARGS); +extern Datum bttext_pattern_sortsupport(PG_FUNCTION_ARGS); extern Datum textlen(PG_FUNCTION_ARGS); extern Datum textoctetlen(PG_FUNCTION_ARGS); extern Datum textpos(PG_FUNCTION_ARGS); @@ -818,6 +823,7 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS); extern Datum name_text(PG_FUNCTION_ARGS); extern Datum text_name(PG_FUNCTION_ARGS); extern int varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid 
collid); +extern void varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar); extern int varstr_levenshtein(const char *source, int slen, const char *target, int tlen, int ins_c, int del_c, int sub_c, diff --git a/src/include/utils/bytea.h b/src/include/utils/bytea.h index 40147fbaa8b..c41e6b4b7a5 100644 --- a/src/include/utils/bytea.h +++ b/src/include/utils/bytea.h @@ -42,6 +42,7 @@ extern Datum byteale(PG_FUNCTION_ARGS); extern Datum byteagt(PG_FUNCTION_ARGS); extern Datum byteage(PG_FUNCTION_ARGS); extern Datum byteacmp(PG_FUNCTION_ARGS); +extern Datum bytea_sortsupport(PG_FUNCTION_ARGS); extern Datum byteacat(PG_FUNCTION_ARGS); extern Datum byteapos(PG_FUNCTION_ARGS); extern Datum bytea_substr(PG_FUNCTION_ARGS);