Separate out bytea sort support from varlena.c

In the wake of commit b45242fd3, bytea_sortsupport() still called out to varstr_sortsupport(). Treating bytea as a kind of text/varchar required varstr_sortsupport() to allow for the possibility of NUL bytes, but only for C collation. This was confusing. For better separation of concerns, create an independent sortsupport implementation in bytea.c. The heuristics for bytea_abbrev_abort() remain the same as for varstr_abbrev_abort(). It's possible that the bytea case warrants different treatment, but that is left for future investigation. In passing, adjust some strange looking comparisons in varstr_abbrev_abort(). Author: Aleksander Alekseev <aleksander@tigerdata.com> Reviewed-by: John Naylor <johncnaylorls@gmail.com> Reviewed-by: Chao Li <li.evan.chao@gmail.com> Discussion: https://postgr.es/m/CAJ7c6TP1bAbEhUJa6+rgceN6QJWMSsxhg1=mqfSN=Nb-n6DAKg@mail.gmail.com
2025-12-18 05:01:01 +03:00 · 2025-12-16 15:19:16 +07:00
parent 15f68cebdc
commit 9303d62c6d
3 changed files with 242 additions and 40 deletions
--- a/src/backend/utils/adt/bytea.c
+++ b/src/backend/utils/adt/bytea.c
@@ -15,18 +15,19 @@
 #include "postgres.h"
 #include "access/detoast.h"
-#include "catalog/pg_collation_d.h"
+#include "common/hashfn.h"
 #include "catalog/pg_type_d.h"
 #include "common/int.h"
 #include "fmgr.h"
 #include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
 #include "port/pg_bitutils.h"
 #include "port/pg_bswap.h"
 #include "utils/builtins.h"
 #include "utils/bytea.h"
 #include "utils/fmgrprotos.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/sortsupport.h"
 #include "utils/varlena.h"
 #include "varatt.h"
 /* GUC variable */
@@ -37,6 +38,19 @@ static bytea *bytea_substring(Datum str, int S, int L,
 							  bool length_not_specified);
 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
 typedef struct
 {
 	bool		abbreviate;		/* Should we abbreviate keys? */
 	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
 	hyperLogLogState full_card; /* Full key cardinality state */
 	double		prop_card;		/* Required cardinality proportion */
 } ByteaSortSupport;
 /* Static function declarations for sort support */
 static int	byteafastcmp(Datum x, Datum y, SortSupport ssup);
 static Datum bytea_abbrev_convert(Datum original, SortSupport ssup);
 static bool bytea_abbrev_abort(int memtupcount, SortSupport ssup);
 /*
 * bytea_catenate
 *	Guts of byteacat(), broken out so it can be used by other functions
@@ -1001,6 +1015,201 @@ bytea_smaller(PG_FUNCTION_ARGS)
 	PG_RETURN_BYTEA_P(result);
 }
 /*
 * sortsupport comparison func
 */
 static int
 byteafastcmp(Datum x, Datum y, SortSupport ssup)
 {
 	bytea	   *arg1 = DatumGetByteaPP(x);
 	bytea	   *arg2 = DatumGetByteaPP(y);
 	char	   *a1p,
 			   *a2p;
 	int			len1,
 				len2,
 				result;
 	a1p = VARDATA_ANY(arg1);
 	a2p = VARDATA_ANY(arg2);
 	len1 = VARSIZE_ANY_EXHDR(arg1);
 	len2 = VARSIZE_ANY_EXHDR(arg2);
 	result = memcmp(a1p, a2p, Min(len1, len2));
 	if ((result == 0) && (len1 != len2))
 		result = (len1 < len2) ? -1 : 1;
 	/* We can't afford to leak memory here. */
 	if (PointerGetDatum(arg1) != x)
 		pfree(arg1);
 	if (PointerGetDatum(arg2) != y)
 		pfree(arg2);
 	return result;
 }
 /*
 * Conversion routine for sortsupport.  Converts original to abbreviated key
 * representation.  Our encoding strategy is simple -- pack the first 8 bytes
 * of the bytea data into a Datum (on little-endian machines, the bytes are
 * stored in reverse order), and treat it as an unsigned integer.
 */
 static Datum
 bytea_abbrev_convert(Datum original, SortSupport ssup)
 {
 	const size_t max_prefix_bytes = sizeof(Datum);
 	ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra;
 	bytea	   *authoritative = DatumGetByteaPP(original);
 	char	   *authoritative_data = VARDATA_ANY(authoritative);
 	Datum		res;
 	char	   *pres;
 	int			len;
 	uint32		hash;
 	pres = (char *) &res;
 	/* memset(), so any non-overwritten bytes are NUL */
 	memset(pres, 0, max_prefix_bytes);
 	len = VARSIZE_ANY_EXHDR(authoritative);
 	/*
 	 * Short byteas will have terminating NUL bytes in the abbreviated datum.
 	 * Abbreviated comparison need not make a distinction between these NUL
 	 * bytes, and NUL bytes representing actual NULs in the authoritative
 	 * representation.
 	 *
 	 * Hopefully a comparison at or past one abbreviated key's terminating NUL
 	 * byte will resolve the comparison without consulting the authoritative
 	 * representation; specifically, some later non-NUL byte in the longer
 	 * bytea can resolve the comparison against a subsequent terminating NUL
 	 * in the shorter bytea.  There will usually be what is effectively a
 	 * "length-wise" resolution there and then.
 	 *
 	 * If that doesn't work out -- if all bytes in the longer bytea positioned
 	 * at or past the offset of the smaller bytea (first) terminating NUL are
 	 * actually representative of NUL bytes in the authoritative binary bytea
 	 * (perhaps with some *terminating* NUL bytes towards the end of the
 	 * longer bytea iff it happens to still be small) -- then an authoritative
 	 * tie-breaker will happen, and do the right thing: explicitly consider
 	 * bytea length.
 	 */
 	memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
 	/*
 	 * Maintain approximate cardinality of both abbreviated keys and original,
 	 * authoritative keys using HyperLogLog.  Used as cheap insurance against
 	 * the worst case, where we do many string abbreviations for no saving in
 	 * full memcmp()-based comparisons.  These statistics are used by
 	 * bytea_abbrev_abort().
 	 *
 	 * First, Hash key proper, or a significant fraction of it.  Mix in length
 	 * in order to compensate for cases where differences are past
 	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
 	 */
 	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
 								   Min(len, PG_CACHE_LINE_SIZE)));
 	if (len > PG_CACHE_LINE_SIZE)
 		hash ^= DatumGetUInt32(hash_uint32((uint32) len));
 	addHyperLogLog(&bss->full_card, hash);
 	/* Hash abbreviated key */
 	{
 		uint32		tmp;
 		tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
 		hash = DatumGetUInt32(hash_uint32(tmp));
 	}
 	addHyperLogLog(&bss->abbr_card, hash);
 	/*
 	 * Byteswap on little-endian machines.
 	 *
 	 * This is needed so that ssup_datum_unsigned_cmp() works correctly on all
 	 * platforms.
 	 */
 	res = DatumBigEndianToNative(res);
 	/* Don't leak memory here */
 	if (PointerGetDatum(authoritative) != original)
 		pfree(authoritative);
 	return res;
 }
 /*
 * Callback for estimating effectiveness of abbreviated key optimization, using
 * heuristic rules.  Returns value indicating if the abbreviation optimization
 * should be aborted, based on its projected effectiveness.
 *
 * This is based on varstr_abbrev_abort(), but some comments have been elided
 * for brevity. See there for more details.
 */
 static bool
 bytea_abbrev_abort(int memtupcount, SortSupport ssup)
 {
 	ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra;
 	double		abbrev_distinct,
 				key_distinct;
 	Assert(ssup->abbreviate);
 	/* Have a little patience */
 	if (memtupcount < 100)
 		return false;
 	abbrev_distinct = estimateHyperLogLog(&bss->abbr_card);
 	key_distinct = estimateHyperLogLog(&bss->full_card);
 	/*
 	 * Clamp cardinality estimates to at least one distinct value.  While
 	 * NULLs are generally disregarded, if only NULL values were seen so far,
 	 * that might misrepresent costs if we failed to clamp.
 	 */
 	if (abbrev_distinct < 1.0)
 		abbrev_distinct = 1.0;
 	if (key_distinct < 1.0)
 		key_distinct = 1.0;
 	if (trace_sort)
 	{
 		double		norm_abbrev_card = abbrev_distinct / (double) memtupcount;
 		elog(LOG, "bytea_abbrev: abbrev_distinct after %d: %f "
 			 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
 			 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
 			 bss->prop_card);
 	}
 	/*
 	 * If the number of distinct abbreviated keys approximately matches the
 	 * number of distinct original keys, continue with abbreviation.
 	 */
 	if (abbrev_distinct > key_distinct * bss->prop_card)
 	{
 		/*
 		 * Decay required cardinality aggressively after 10,000 tuples.
 		 */
 		if (memtupcount > 10000)
 			bss->prop_card *= 0.65;
 		return false;
 	}
 	/*
 	 * Abort abbreviation strategy.
 	 */
 	if (trace_sort)
 		elog(LOG, "bytea_abbrev: aborted abbreviation at %d "
 			 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
 			 memtupcount, abbrev_distinct, key_distinct, bss->prop_card);
 	return true;
 }
 Datum
 bytea_sortsupport(PG_FUNCTION_ARGS)
 {
@@ -1009,8 +1218,27 @@ bytea_sortsupport(PG_FUNCTION_ARGS)
 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
-	/* Use generic string SortSupport, forcing "C" collation */
+	ssup->comparator = byteafastcmp;
-	varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
+
 	/*
 	 * Set up abbreviation support if requested.
 	 */
 	if (ssup->abbreviate)
 	{
 		ByteaSortSupport *bss;
 		bss = palloc_object(ByteaSortSupport);
 		bss->abbreviate = true;
 		bss->prop_card = 0.20;
 		initHyperLogLog(&bss->abbr_card, 10);
 		initHyperLogLog(&bss->full_card, 10);
 		ssup->ssup_extra = bss;
 		ssup->abbrev_full_comparator = ssup->comparator;
 		ssup->comparator = ssup_datum_unsigned_cmp;
 		ssup->abbrev_converter = bytea_abbrev_convert;
 		ssup->abbrev_abort = bytea_abbrev_abort;
 	}
 	MemoryContextSwitchTo(oldcontext);
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -92,7 +92,7 @@ typedef struct
 	int			last_returned;	/* Last comparison result (cache) */
 	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
 	bool		collate_c;
-	Oid			typid;			/* Actual datatype (text/bpchar/bytea/name) */
+	Oid			typid;			/* Actual datatype (text/bpchar/name) */
 	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
 	hyperLogLogState full_card; /* Full key cardinality state */
 	double		prop_card;		/* Required cardinality proportion */
@@ -1617,10 +1617,8 @@ bttextsortsupport(PG_FUNCTION_ARGS)
 * Includes locale support, and support for BpChar semantics (i.e. removing
 * trailing spaces before comparison).
 *
- * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
+ * Relies on the assumption that text, VarChar, and BpChar all have the
- * same representation.  Callers that always use the C collation (e.g.
+ * same representation.
 * non-collatable type callers like bytea) may have NUL bytes in their strings;
 * this will not work with any other collation, though.
 */
 void
 varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
@@ -1983,7 +1981,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
 * representation.  Our encoding strategy is simple -- pack the first 8 bytes
 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
 * stored in reverse order), and treat it as an unsigned integer.  When the "C"
- * locale is used, or in case of bytea, just memcpy() from original instead.
+ * locale is used just memcpy() from original instead.
 */
 static Datum
 varstr_abbrev_convert(Datum original, SortSupport ssup)
@@ -2010,30 +2008,8 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 	/*
 	 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
-	 * abbreviate keys.  The full comparator for the C locale is always
+	 * abbreviate keys.  The full comparator for the C locale is also
-	 * memcmp().  It would be incorrect to allow bytea callers (callers that
+	 * memcmp().  This should be faster than strxfrm().
 	 * always force the C collation -- bytea isn't a collatable type, but this
 	 * approach is convenient) to use strxfrm().  This is because bytea
 	 * strings may contain NUL bytes.  Besides, this should be faster, too.
 	 *
 	 * More generally, it's okay that bytea callers can have NUL bytes in
 	 * strings because abbreviated cmp need not make a distinction between
 	 * terminating NUL bytes, and NUL bytes representing actual NULs in the
 	 * authoritative representation.  Hopefully a comparison at or past one
 	 * abbreviated key's terminating NUL byte will resolve the comparison
 	 * without consulting the authoritative representation; specifically, some
 	 * later non-NUL byte in the longer string can resolve the comparison
 	 * against a subsequent terminating NUL in the shorter string.  There will
 	 * usually be what is effectively a "length-wise" resolution there and
 	 * then.
 	 *
 	 * If that doesn't work out -- if all bytes in the longer string
 	 * positioned at or past the offset of the smaller string's (first)
 	 * terminating NUL are actually representative of NUL bytes in the
 	 * authoritative binary string (perhaps with some *terminating* NUL bytes
 	 * towards the end of the longer string iff it happens to still be small)
 	 * -- then an authoritative tie-breaker will happen, and do the right
 	 * thing: explicitly consider string length.
 	 */
 	if (sss->collate_c)
 		memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
@@ -2115,9 +2091,6 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 		 * strxfrm() blob is itself NUL terminated, leaving no danger of
 		 * misinterpreting any NUL bytes not intended to be interpreted as
 		 * logically representing termination.
 		 *
 		 * (Actually, even if there were NUL bytes in the blob it would be
 		 * okay.  See remarks on bytea case above.)
 		 */
 		memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
 	}
@@ -2198,10 +2171,10 @@ varstr_abbrev_abort(int memtupcount, SortSupport ssup)
 	 * NULLs are generally disregarded, if only NULL values were seen so far,
 	 * that might misrepresent costs if we failed to clamp.
 	 */
-	if (abbrev_distinct <= 1.0)
+	if (abbrev_distinct < 1.0)
 		abbrev_distinct = 1.0;
-	if (key_distinct <= 1.0)
+	if (key_distinct < 1.0)
 		key_distinct = 1.0;
 	/*
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -366,6 +366,7 @@ BulkWriteBuffer
 BulkWriteState
 BumpBlock
 BumpContext
 ByteaSortSupport
 CACHESIGN
 CAC_state
 CCFastEqualFN