mirror of
https://github.com/postgres/postgres.git
synced 2025-12-19 17:02:53 +03:00
Separate out bytea sort support from varlena.c
In the wake of commit b45242fd3, bytea_sortsupport() still called out
to varstr_sortsupport(). Treating bytea as a kind of text/varchar
required varstr_sortsupport() to allow for the possibility of
NUL bytes, but only for C collation. This was confusing. For
better separation of concerns, create an independent sortsupport
implementation in bytea.c.
The heuristics for bytea_abbrev_abort() remain the same as for
varstr_abbrev_abort(). It's possible that the bytea case warrants
different treatment, but that is left for future investigation.
In passing, adjust some strange-looking comparisons in
varstr_abbrev_abort().
Author: Aleksander Alekseev <aleksander@tigerdata.com>
Reviewed-by: John Naylor <johncnaylorls@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/CAJ7c6TP1bAbEhUJa6+rgceN6QJWMSsxhg1=mqfSN=Nb-n6DAKg@mail.gmail.com
This commit is contained in:
@@ -15,18 +15,19 @@
|
|||||||
#include "postgres.h"
|
#include "postgres.h"
|
||||||
|
|
||||||
#include "access/detoast.h"
|
#include "access/detoast.h"
|
||||||
#include "catalog/pg_collation_d.h"
|
#include "common/hashfn.h"
|
||||||
#include "catalog/pg_type_d.h"
|
|
||||||
#include "common/int.h"
|
#include "common/int.h"
|
||||||
#include "fmgr.h"
|
#include "fmgr.h"
|
||||||
|
#include "lib/hyperloglog.h"
|
||||||
#include "libpq/pqformat.h"
|
#include "libpq/pqformat.h"
|
||||||
#include "port/pg_bitutils.h"
|
#include "port/pg_bitutils.h"
|
||||||
|
#include "port/pg_bswap.h"
|
||||||
#include "utils/builtins.h"
|
#include "utils/builtins.h"
|
||||||
#include "utils/bytea.h"
|
#include "utils/bytea.h"
|
||||||
#include "utils/fmgrprotos.h"
|
#include "utils/fmgrprotos.h"
|
||||||
|
#include "utils/guc.h"
|
||||||
#include "utils/memutils.h"
|
#include "utils/memutils.h"
|
||||||
#include "utils/sortsupport.h"
|
#include "utils/sortsupport.h"
|
||||||
#include "utils/varlena.h"
|
|
||||||
#include "varatt.h"
|
#include "varatt.h"
|
||||||
|
|
||||||
/* GUC variable */
|
/* GUC variable */
|
||||||
@@ -37,6 +38,19 @@ static bytea *bytea_substring(Datum str, int S, int L,
|
|||||||
bool length_not_specified);
|
bool length_not_specified);
|
||||||
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
|
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
bool abbreviate; /* Should we abbreviate keys? */
|
||||||
|
hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
|
||||||
|
hyperLogLogState full_card; /* Full key cardinality state */
|
||||||
|
double prop_card; /* Required cardinality proportion */
|
||||||
|
} ByteaSortSupport;
|
||||||
|
|
||||||
|
/* Static function declarations for sort support */
|
||||||
|
static int byteafastcmp(Datum x, Datum y, SortSupport ssup);
|
||||||
|
static Datum bytea_abbrev_convert(Datum original, SortSupport ssup);
|
||||||
|
static bool bytea_abbrev_abort(int memtupcount, SortSupport ssup);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* bytea_catenate
|
* bytea_catenate
|
||||||
* Guts of byteacat(), broken out so it can be used by other functions
|
* Guts of byteacat(), broken out so it can be used by other functions
|
||||||
@@ -1001,6 +1015,201 @@ bytea_smaller(PG_FUNCTION_ARGS)
|
|||||||
PG_RETURN_BYTEA_P(result);
|
PG_RETURN_BYTEA_P(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* sortsupport comparison func
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
byteafastcmp(Datum x, Datum y, SortSupport ssup)
|
||||||
|
{
|
||||||
|
bytea *arg1 = DatumGetByteaPP(x);
|
||||||
|
bytea *arg2 = DatumGetByteaPP(y);
|
||||||
|
char *a1p,
|
||||||
|
*a2p;
|
||||||
|
int len1,
|
||||||
|
len2,
|
||||||
|
result;
|
||||||
|
|
||||||
|
a1p = VARDATA_ANY(arg1);
|
||||||
|
a2p = VARDATA_ANY(arg2);
|
||||||
|
|
||||||
|
len1 = VARSIZE_ANY_EXHDR(arg1);
|
||||||
|
len2 = VARSIZE_ANY_EXHDR(arg2);
|
||||||
|
|
||||||
|
result = memcmp(a1p, a2p, Min(len1, len2));
|
||||||
|
if ((result == 0) && (len1 != len2))
|
||||||
|
result = (len1 < len2) ? -1 : 1;
|
||||||
|
|
||||||
|
/* We can't afford to leak memory here. */
|
||||||
|
if (PointerGetDatum(arg1) != x)
|
||||||
|
pfree(arg1);
|
||||||
|
if (PointerGetDatum(arg2) != y)
|
||||||
|
pfree(arg2);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Conversion routine for sortsupport. Converts original to abbreviated key
|
||||||
|
* representation. Our encoding strategy is simple -- pack the first 8 bytes
|
||||||
|
* of the bytea data into a Datum (on little-endian machines, the bytes are
|
||||||
|
* stored in reverse order), and treat it as an unsigned integer.
|
||||||
|
*/
|
||||||
|
static Datum
|
||||||
|
bytea_abbrev_convert(Datum original, SortSupport ssup)
|
||||||
|
{
|
||||||
|
const size_t max_prefix_bytes = sizeof(Datum);
|
||||||
|
ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra;
|
||||||
|
bytea *authoritative = DatumGetByteaPP(original);
|
||||||
|
char *authoritative_data = VARDATA_ANY(authoritative);
|
||||||
|
Datum res;
|
||||||
|
char *pres;
|
||||||
|
int len;
|
||||||
|
uint32 hash;
|
||||||
|
|
||||||
|
pres = (char *) &res;
|
||||||
|
|
||||||
|
/* memset(), so any non-overwritten bytes are NUL */
|
||||||
|
memset(pres, 0, max_prefix_bytes);
|
||||||
|
len = VARSIZE_ANY_EXHDR(authoritative);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Short byteas will have terminating NUL bytes in the abbreviated datum.
|
||||||
|
* Abbreviated comparison need not make a distinction between these NUL
|
||||||
|
* bytes, and NUL bytes representing actual NULs in the authoritative
|
||||||
|
* representation.
|
||||||
|
*
|
||||||
|
* Hopefully a comparison at or past one abbreviated key's terminating NUL
|
||||||
|
* byte will resolve the comparison without consulting the authoritative
|
||||||
|
* representation; specifically, some later non-NUL byte in the longer
|
||||||
|
* bytea can resolve the comparison against a subsequent terminating NUL
|
||||||
|
* in the shorter bytea. There will usually be what is effectively a
|
||||||
|
* "length-wise" resolution there and then.
|
||||||
|
*
|
||||||
|
* If that doesn't work out -- if all bytes in the longer bytea positioned
|
||||||
|
* at or past the offset of the smaller bytea (first) terminating NUL are
|
||||||
|
* actually representative of NUL bytes in the authoritative binary bytea
|
||||||
|
* (perhaps with some *terminating* NUL bytes towards the end of the
|
||||||
|
* longer bytea iff it happens to still be small) -- then an authoritative
|
||||||
|
* tie-breaker will happen, and do the right thing: explicitly consider
|
||||||
|
* bytea length.
|
||||||
|
*/
|
||||||
|
memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Maintain approximate cardinality of both abbreviated keys and original,
|
||||||
|
* authoritative keys using HyperLogLog. Used as cheap insurance against
|
||||||
|
* the worst case, where we do many string abbreviations for no saving in
|
||||||
|
* full memcmp()-based comparisons. These statistics are used by
|
||||||
|
* bytea_abbrev_abort().
|
||||||
|
*
|
||||||
|
* First, Hash key proper, or a significant fraction of it. Mix in length
|
||||||
|
* in order to compensate for cases where differences are past
|
||||||
|
* PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
|
||||||
|
*/
|
||||||
|
hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
|
||||||
|
Min(len, PG_CACHE_LINE_SIZE)));
|
||||||
|
|
||||||
|
if (len > PG_CACHE_LINE_SIZE)
|
||||||
|
hash ^= DatumGetUInt32(hash_uint32((uint32) len));
|
||||||
|
|
||||||
|
addHyperLogLog(&bss->full_card, hash);
|
||||||
|
|
||||||
|
/* Hash abbreviated key */
|
||||||
|
{
|
||||||
|
uint32 tmp;
|
||||||
|
|
||||||
|
tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
|
||||||
|
hash = DatumGetUInt32(hash_uint32(tmp));
|
||||||
|
}
|
||||||
|
|
||||||
|
addHyperLogLog(&bss->abbr_card, hash);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Byteswap on little-endian machines.
|
||||||
|
*
|
||||||
|
* This is needed so that ssup_datum_unsigned_cmp() works correctly on all
|
||||||
|
* platforms.
|
||||||
|
*/
|
||||||
|
res = DatumBigEndianToNative(res);
|
||||||
|
|
||||||
|
/* Don't leak memory here */
|
||||||
|
if (PointerGetDatum(authoritative) != original)
|
||||||
|
pfree(authoritative);
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Callback for estimating effectiveness of abbreviated key optimization, using
|
||||||
|
* heuristic rules. Returns value indicating if the abbreviation optimization
|
||||||
|
* should be aborted, based on its projected effectiveness.
|
||||||
|
*
|
||||||
|
* This is based on varstr_abbrev_abort(), but some comments have been elided
|
||||||
|
* for brevity. See there for more details.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
bytea_abbrev_abort(int memtupcount, SortSupport ssup)
|
||||||
|
{
|
||||||
|
ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra;
|
||||||
|
double abbrev_distinct,
|
||||||
|
key_distinct;
|
||||||
|
|
||||||
|
Assert(ssup->abbreviate);
|
||||||
|
|
||||||
|
/* Have a little patience */
|
||||||
|
if (memtupcount < 100)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
abbrev_distinct = estimateHyperLogLog(&bss->abbr_card);
|
||||||
|
key_distinct = estimateHyperLogLog(&bss->full_card);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Clamp cardinality estimates to at least one distinct value. While
|
||||||
|
* NULLs are generally disregarded, if only NULL values were seen so far,
|
||||||
|
* that might misrepresent costs if we failed to clamp.
|
||||||
|
*/
|
||||||
|
if (abbrev_distinct < 1.0)
|
||||||
|
abbrev_distinct = 1.0;
|
||||||
|
|
||||||
|
if (key_distinct < 1.0)
|
||||||
|
key_distinct = 1.0;
|
||||||
|
|
||||||
|
if (trace_sort)
|
||||||
|
{
|
||||||
|
double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
|
||||||
|
|
||||||
|
elog(LOG, "bytea_abbrev: abbrev_distinct after %d: %f "
|
||||||
|
"(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
|
||||||
|
memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
|
||||||
|
bss->prop_card);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the number of distinct abbreviated keys approximately matches the
|
||||||
|
* number of distinct original keys, continue with abbreviation.
|
||||||
|
*/
|
||||||
|
if (abbrev_distinct > key_distinct * bss->prop_card)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Decay required cardinality aggressively after 10,000 tuples.
|
||||||
|
*/
|
||||||
|
if (memtupcount > 10000)
|
||||||
|
bss->prop_card *= 0.65;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Abort abbreviation strategy.
|
||||||
|
*/
|
||||||
|
if (trace_sort)
|
||||||
|
elog(LOG, "bytea_abbrev: aborted abbreviation at %d "
|
||||||
|
"(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
|
||||||
|
memtupcount, abbrev_distinct, key_distinct, bss->prop_card);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
Datum
|
Datum
|
||||||
bytea_sortsupport(PG_FUNCTION_ARGS)
|
bytea_sortsupport(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
@@ -1009,8 +1218,27 @@ bytea_sortsupport(PG_FUNCTION_ARGS)
|
|||||||
|
|
||||||
oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
|
oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
|
||||||
|
|
||||||
/* Use generic string SortSupport, forcing "C" collation */
|
ssup->comparator = byteafastcmp;
|
||||||
varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
|
|
||||||
|
/*
|
||||||
|
* Set up abbreviation support if requested.
|
||||||
|
*/
|
||||||
|
if (ssup->abbreviate)
|
||||||
|
{
|
||||||
|
ByteaSortSupport *bss;
|
||||||
|
|
||||||
|
bss = palloc_object(ByteaSortSupport);
|
||||||
|
bss->abbreviate = true;
|
||||||
|
bss->prop_card = 0.20;
|
||||||
|
initHyperLogLog(&bss->abbr_card, 10);
|
||||||
|
initHyperLogLog(&bss->full_card, 10);
|
||||||
|
|
||||||
|
ssup->ssup_extra = bss;
|
||||||
|
ssup->abbrev_full_comparator = ssup->comparator;
|
||||||
|
ssup->comparator = ssup_datum_unsigned_cmp;
|
||||||
|
ssup->abbrev_converter = bytea_abbrev_convert;
|
||||||
|
ssup->abbrev_abort = bytea_abbrev_abort;
|
||||||
|
}
|
||||||
|
|
||||||
MemoryContextSwitchTo(oldcontext);
|
MemoryContextSwitchTo(oldcontext);
|
||||||
|
|
||||||
|
|||||||
@@ -92,7 +92,7 @@ typedef struct
|
|||||||
int last_returned; /* Last comparison result (cache) */
|
int last_returned; /* Last comparison result (cache) */
|
||||||
bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
|
bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
|
||||||
bool collate_c;
|
bool collate_c;
|
||||||
Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
|
Oid typid; /* Actual datatype (text/bpchar/name) */
|
||||||
hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
|
hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
|
||||||
hyperLogLogState full_card; /* Full key cardinality state */
|
hyperLogLogState full_card; /* Full key cardinality state */
|
||||||
double prop_card; /* Required cardinality proportion */
|
double prop_card; /* Required cardinality proportion */
|
||||||
@@ -1617,10 +1617,8 @@ bttextsortsupport(PG_FUNCTION_ARGS)
|
|||||||
* Includes locale support, and support for BpChar semantics (i.e. removing
|
* Includes locale support, and support for BpChar semantics (i.e. removing
|
||||||
* trailing spaces before comparison).
|
* trailing spaces before comparison).
|
||||||
*
|
*
|
||||||
* Relies on the assumption that text, VarChar, BpChar, and bytea all have the
|
* Relies on the assumption that text, VarChar, and BpChar all have the
|
||||||
* same representation. Callers that always use the C collation (e.g.
|
* same representation.
|
||||||
* non-collatable type callers like bytea) may have NUL bytes in their strings;
|
|
||||||
* this will not work with any other collation, though.
|
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
|
varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
|
||||||
@@ -1983,7 +1981,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
|
|||||||
* representation. Our encoding strategy is simple -- pack the first 8 bytes
|
* representation. Our encoding strategy is simple -- pack the first 8 bytes
|
||||||
* of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
|
* of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
|
||||||
* stored in reverse order), and treat it as an unsigned integer. When the "C"
|
* stored in reverse order), and treat it as an unsigned integer. When the "C"
|
||||||
* locale is used, or in case of bytea, just memcpy() from original instead.
|
* locale is used, just memcpy() from original instead.
|
||||||
*/
|
*/
|
||||||
static Datum
|
static Datum
|
||||||
varstr_abbrev_convert(Datum original, SortSupport ssup)
|
varstr_abbrev_convert(Datum original, SortSupport ssup)
|
||||||
@@ -2010,30 +2008,8 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* If we're using the C collation, use memcpy(), rather than strxfrm(), to
|
* If we're using the C collation, use memcpy(), rather than strxfrm(), to
|
||||||
* abbreviate keys. The full comparator for the C locale is always
|
* abbreviate keys. The full comparator for the C locale is also
|
||||||
* memcmp(). It would be incorrect to allow bytea callers (callers that
|
* memcmp(). This should be faster than strxfrm().
|
||||||
* always force the C collation -- bytea isn't a collatable type, but this
|
|
||||||
* approach is convenient) to use strxfrm(). This is because bytea
|
|
||||||
* strings may contain NUL bytes. Besides, this should be faster, too.
|
|
||||||
*
|
|
||||||
* More generally, it's okay that bytea callers can have NUL bytes in
|
|
||||||
* strings because abbreviated cmp need not make a distinction between
|
|
||||||
* terminating NUL bytes, and NUL bytes representing actual NULs in the
|
|
||||||
* authoritative representation. Hopefully a comparison at or past one
|
|
||||||
* abbreviated key's terminating NUL byte will resolve the comparison
|
|
||||||
* without consulting the authoritative representation; specifically, some
|
|
||||||
* later non-NUL byte in the longer string can resolve the comparison
|
|
||||||
* against a subsequent terminating NUL in the shorter string. There will
|
|
||||||
* usually be what is effectively a "length-wise" resolution there and
|
|
||||||
* then.
|
|
||||||
*
|
|
||||||
* If that doesn't work out -- if all bytes in the longer string
|
|
||||||
* positioned at or past the offset of the smaller string's (first)
|
|
||||||
* terminating NUL are actually representative of NUL bytes in the
|
|
||||||
* authoritative binary string (perhaps with some *terminating* NUL bytes
|
|
||||||
* towards the end of the longer string iff it happens to still be small)
|
|
||||||
* -- then an authoritative tie-breaker will happen, and do the right
|
|
||||||
* thing: explicitly consider string length.
|
|
||||||
*/
|
*/
|
||||||
if (sss->collate_c)
|
if (sss->collate_c)
|
||||||
memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
|
memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
|
||||||
@@ -2115,9 +2091,6 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
|
|||||||
* strxfrm() blob is itself NUL terminated, leaving no danger of
|
* strxfrm() blob is itself NUL terminated, leaving no danger of
|
||||||
* misinterpreting any NUL bytes not intended to be interpreted as
|
* misinterpreting any NUL bytes not intended to be interpreted as
|
||||||
* logically representing termination.
|
* logically representing termination.
|
||||||
*
|
|
||||||
* (Actually, even if there were NUL bytes in the blob it would be
|
|
||||||
* okay. See remarks on bytea case above.)
|
|
||||||
*/
|
*/
|
||||||
memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
|
memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
|
||||||
}
|
}
|
||||||
@@ -2198,10 +2171,10 @@ varstr_abbrev_abort(int memtupcount, SortSupport ssup)
|
|||||||
* NULLs are generally disregarded, if only NULL values were seen so far,
|
* NULLs are generally disregarded, if only NULL values were seen so far,
|
||||||
* that might misrepresent costs if we failed to clamp.
|
* that might misrepresent costs if we failed to clamp.
|
||||||
*/
|
*/
|
||||||
if (abbrev_distinct <= 1.0)
|
if (abbrev_distinct < 1.0)
|
||||||
abbrev_distinct = 1.0;
|
abbrev_distinct = 1.0;
|
||||||
|
|
||||||
if (key_distinct <= 1.0)
|
if (key_distinct < 1.0)
|
||||||
key_distinct = 1.0;
|
key_distinct = 1.0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|||||||
@@ -366,6 +366,7 @@ BulkWriteBuffer
|
|||||||
BulkWriteState
|
BulkWriteState
|
||||||
BumpBlock
|
BumpBlock
|
||||||
BumpContext
|
BumpContext
|
||||||
|
ByteaSortSupport
|
||||||
CACHESIGN
|
CACHESIGN
|
||||||
CAC_state
|
CAC_state
|
||||||
CCFastEqualFN
|
CCFastEqualFN
|
||||||
|
|||||||
Reference in New Issue
Block a user