1
0
mirror of https://github.com/postgres/postgres.git synced 2025-04-25 21:42:33 +03:00
John Naylor 6974924347 Specialize tuplesort routines for different kinds of abbreviated keys
Previously, the specialized tuplesort routine inlined handling for
reverse-sort and NULLs-ordering but called the datum comparator via a
pointer in the SortSupport struct parameter. Testing has showed that we
can get a useful performance gain by specializing datum comparison for
the different representations of abbreviated keys -- signed and unsigned
64-bit integers and signed 32-bit integers. Almost all abbreviatable data
types will benefit -- the only exception for now is numeric, since the
datum comparison is more complex. The performance gain depends on data
type and input distribution, but often falls in the range of 10-20% faster.

Thomas Munro

Reviewed by Peter Geoghegan, review and performance testing by me

Discussion:
https://www.postgresql.org/message-id/CA%2BhUKGKKYttZZk-JMRQSVak%3DCXSJ5fiwtirFf%3Dn%3DPAbumvn1Ww%40mail.gmail.com
2022-04-02 15:22:25 +07:00

424 lines
9.8 KiB
C

/*-------------------------------------------------------------------------
*
* uuid.c
* Functions for the built-in type "uuid".
*
* Copyright (c) 2007-2022, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/utils/adt/uuid.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "common/hashfn.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
#include "port/pg_bswap.h"
#include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/sortsupport.h"
#include "utils/uuid.h"
/* sortsupport for uuid */
typedef struct
{
int64 input_count; /* number of non-null values seen */
bool estimating; /* true if estimating cardinality */
hyperLogLogState abbr_card; /* cardinality estimator */
} uuid_sortsupport_state;
static void string_to_uuid(const char *source, pg_uuid_t *uuid);
static int uuid_internal_cmp(const pg_uuid_t *arg1, const pg_uuid_t *arg2);
static int uuid_fast_cmp(Datum x, Datum y, SortSupport ssup);
static bool uuid_abbrev_abort(int memtupcount, SortSupport ssup);
static Datum uuid_abbrev_convert(Datum original, SortSupport ssup);
Datum
uuid_in(PG_FUNCTION_ARGS)
{
char *uuid_str = PG_GETARG_CSTRING(0);
pg_uuid_t *uuid;
uuid = (pg_uuid_t *) palloc(sizeof(*uuid));
string_to_uuid(uuid_str, uuid);
PG_RETURN_UUID_P(uuid);
}
Datum
uuid_out(PG_FUNCTION_ARGS)
{
pg_uuid_t *uuid = PG_GETARG_UUID_P(0);
static const char hex_chars[] = "0123456789abcdef";
StringInfoData buf;
int i;
initStringInfo(&buf);
for (i = 0; i < UUID_LEN; i++)
{
int hi;
int lo;
/*
* We print uuid values as a string of 8, 4, 4, 4, and then 12
* hexadecimal characters, with each group is separated by a hyphen
* ("-"). Therefore, add the hyphens at the appropriate places here.
*/
if (i == 4 || i == 6 || i == 8 || i == 10)
appendStringInfoChar(&buf, '-');
hi = uuid->data[i] >> 4;
lo = uuid->data[i] & 0x0F;
appendStringInfoChar(&buf, hex_chars[hi]);
appendStringInfoChar(&buf, hex_chars[lo]);
}
PG_RETURN_CSTRING(buf.data);
}
/*
* We allow UUIDs as a series of 32 hexadecimal digits with an optional dash
* after each group of 4 hexadecimal digits, and optionally surrounded by {}.
* (The canonical format 8x-4x-4x-4x-12x, where "nx" means n hexadecimal
* digits, is the only one used for output.)
*/
static void
string_to_uuid(const char *source, pg_uuid_t *uuid)
{
const char *src = source;
bool braces = false;
int i;
if (src[0] == '{')
{
src++;
braces = true;
}
for (i = 0; i < UUID_LEN; i++)
{
char str_buf[3];
if (src[0] == '\0' || src[1] == '\0')
goto syntax_error;
memcpy(str_buf, src, 2);
if (!isxdigit((unsigned char) str_buf[0]) ||
!isxdigit((unsigned char) str_buf[1]))
goto syntax_error;
str_buf[2] = '\0';
uuid->data[i] = (unsigned char) strtoul(str_buf, NULL, 16);
src += 2;
if (src[0] == '-' && (i % 2) == 1 && i < UUID_LEN - 1)
src++;
}
if (braces)
{
if (*src != '}')
goto syntax_error;
src++;
}
if (*src != '\0')
goto syntax_error;
return;
syntax_error:
ereport(ERROR,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type %s: \"%s\"",
"uuid", source)));
}
Datum
uuid_recv(PG_FUNCTION_ARGS)
{
StringInfo buffer = (StringInfo) PG_GETARG_POINTER(0);
pg_uuid_t *uuid;
uuid = (pg_uuid_t *) palloc(UUID_LEN);
memcpy(uuid->data, pq_getmsgbytes(buffer, UUID_LEN), UUID_LEN);
PG_RETURN_POINTER(uuid);
}
Datum
uuid_send(PG_FUNCTION_ARGS)
{
pg_uuid_t *uuid = PG_GETARG_UUID_P(0);
StringInfoData buffer;
pq_begintypsend(&buffer);
pq_sendbytes(&buffer, (char *) uuid->data, UUID_LEN);
PG_RETURN_BYTEA_P(pq_endtypsend(&buffer));
}
/* internal uuid compare function */
static int
uuid_internal_cmp(const pg_uuid_t *arg1, const pg_uuid_t *arg2)
{
return memcmp(arg1->data, arg2->data, UUID_LEN);
}
Datum
uuid_lt(PG_FUNCTION_ARGS)
{
pg_uuid_t *arg1 = PG_GETARG_UUID_P(0);
pg_uuid_t *arg2 = PG_GETARG_UUID_P(1);
PG_RETURN_BOOL(uuid_internal_cmp(arg1, arg2) < 0);
}
Datum
uuid_le(PG_FUNCTION_ARGS)
{
pg_uuid_t *arg1 = PG_GETARG_UUID_P(0);
pg_uuid_t *arg2 = PG_GETARG_UUID_P(1);
PG_RETURN_BOOL(uuid_internal_cmp(arg1, arg2) <= 0);
}
Datum
uuid_eq(PG_FUNCTION_ARGS)
{
pg_uuid_t *arg1 = PG_GETARG_UUID_P(0);
pg_uuid_t *arg2 = PG_GETARG_UUID_P(1);
PG_RETURN_BOOL(uuid_internal_cmp(arg1, arg2) == 0);
}
Datum
uuid_ge(PG_FUNCTION_ARGS)
{
pg_uuid_t *arg1 = PG_GETARG_UUID_P(0);
pg_uuid_t *arg2 = PG_GETARG_UUID_P(1);
PG_RETURN_BOOL(uuid_internal_cmp(arg1, arg2) >= 0);
}
Datum
uuid_gt(PG_FUNCTION_ARGS)
{
pg_uuid_t *arg1 = PG_GETARG_UUID_P(0);
pg_uuid_t *arg2 = PG_GETARG_UUID_P(1);
PG_RETURN_BOOL(uuid_internal_cmp(arg1, arg2) > 0);
}
Datum
uuid_ne(PG_FUNCTION_ARGS)
{
pg_uuid_t *arg1 = PG_GETARG_UUID_P(0);
pg_uuid_t *arg2 = PG_GETARG_UUID_P(1);
PG_RETURN_BOOL(uuid_internal_cmp(arg1, arg2) != 0);
}
/* handler for btree index operator */
Datum
uuid_cmp(PG_FUNCTION_ARGS)
{
pg_uuid_t *arg1 = PG_GETARG_UUID_P(0);
pg_uuid_t *arg2 = PG_GETARG_UUID_P(1);
PG_RETURN_INT32(uuid_internal_cmp(arg1, arg2));
}
/*
* Sort support strategy routine
*/
Datum
uuid_sortsupport(PG_FUNCTION_ARGS)
{
SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
ssup->comparator = uuid_fast_cmp;
ssup->ssup_extra = NULL;
if (ssup->abbreviate)
{
uuid_sortsupport_state *uss;
MemoryContext oldcontext;
oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
uss = palloc(sizeof(uuid_sortsupport_state));
uss->input_count = 0;
uss->estimating = true;
initHyperLogLog(&uss->abbr_card, 10);
ssup->ssup_extra = uss;
ssup->comparator = ssup_datum_unsigned_cmp;
ssup->abbrev_converter = uuid_abbrev_convert;
ssup->abbrev_abort = uuid_abbrev_abort;
ssup->abbrev_full_comparator = uuid_fast_cmp;
MemoryContextSwitchTo(oldcontext);
}
PG_RETURN_VOID();
}
/*
* SortSupport comparison func
*/
static int
uuid_fast_cmp(Datum x, Datum y, SortSupport ssup)
{
pg_uuid_t *arg1 = DatumGetUUIDP(x);
pg_uuid_t *arg2 = DatumGetUUIDP(y);
return uuid_internal_cmp(arg1, arg2);
}
/*
* Callback for estimating effectiveness of abbreviated key optimization.
*
* We pay no attention to the cardinality of the non-abbreviated data, because
* there is no equality fast-path within authoritative uuid comparator.
*/
static bool
uuid_abbrev_abort(int memtupcount, SortSupport ssup)
{
uuid_sortsupport_state *uss = ssup->ssup_extra;
double abbr_card;
if (memtupcount < 10000 || uss->input_count < 10000 || !uss->estimating)
return false;
abbr_card = estimateHyperLogLog(&uss->abbr_card);
/*
* If we have >100k distinct values, then even if we were sorting many
* billion rows we'd likely still break even, and the penalty of undoing
* that many rows of abbrevs would probably not be worth it. Stop even
* counting at that point.
*/
if (abbr_card > 100000.0)
{
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"uuid_abbrev: estimation ends at cardinality %f"
" after " INT64_FORMAT " values (%d rows)",
abbr_card, uss->input_count, memtupcount);
#endif
uss->estimating = false;
return false;
}
/*
* Target minimum cardinality is 1 per ~2k of non-null inputs. 0.5 row
* fudge factor allows us to abort earlier on genuinely pathological data
* where we've had exactly one abbreviated value in the first 2k
* (non-null) rows.
*/
if (abbr_card < uss->input_count / 2000.0 + 0.5)
{
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"uuid_abbrev: aborting abbreviation at cardinality %f"
" below threshold %f after " INT64_FORMAT " values (%d rows)",
abbr_card, uss->input_count / 2000.0 + 0.5, uss->input_count,
memtupcount);
#endif
return true;
}
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"uuid_abbrev: cardinality %f after " INT64_FORMAT
" values (%d rows)", abbr_card, uss->input_count, memtupcount);
#endif
return false;
}
/*
* Conversion routine for sortsupport. Converts original uuid representation
* to abbreviated key representation. Our encoding strategy is simple -- pack
* the first `sizeof(Datum)` bytes of uuid data into a Datum (on little-endian
* machines, the bytes are stored in reverse order), and treat it as an
* unsigned integer.
*/
static Datum
uuid_abbrev_convert(Datum original, SortSupport ssup)
{
uuid_sortsupport_state *uss = ssup->ssup_extra;
pg_uuid_t *authoritative = DatumGetUUIDP(original);
Datum res;
memcpy(&res, authoritative->data, sizeof(Datum));
uss->input_count += 1;
if (uss->estimating)
{
uint32 tmp;
#if SIZEOF_DATUM == 8
tmp = (uint32) res ^ (uint32) ((uint64) res >> 32);
#else /* SIZEOF_DATUM != 8 */
tmp = (uint32) res;
#endif
addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp)));
}
/*
* Byteswap on little-endian machines.
*
* This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
* 3-way comparator) works correctly on all platforms. If we didn't do
* this, the comparator would have to call memcmp() with a pair of pointers
* to the first byte of each abbreviated key, which is slower.
*/
res = DatumBigEndianToNative(res);
return res;
}
/* hash index support */
Datum
uuid_hash(PG_FUNCTION_ARGS)
{
pg_uuid_t *key = PG_GETARG_UUID_P(0);
return hash_any(key->data, UUID_LEN);
}
Datum
uuid_hash_extended(PG_FUNCTION_ARGS)
{
pg_uuid_t *key = PG_GETARG_UUID_P(0);
return hash_any_extended(key->data, UUID_LEN, PG_GETARG_INT64(1));
}
Datum
gen_random_uuid(PG_FUNCTION_ARGS)
{
pg_uuid_t *uuid = palloc(UUID_LEN);
if (!pg_strong_random(uuid, UUID_LEN))
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg("could not generate random values")));
/*
* Set magic numbers for a "version 4" (pseudorandom) UUID, see
* http://tools.ietf.org/html/rfc4122#section-4.4
*/
uuid->data[6] = (uuid->data[6] & 0x0f) | 0x40; /* time_hi_and_version */
uuid->data[8] = (uuid->data[8] & 0x3f) | 0x80; /* clock_seq_hi_and_reserved */
PG_RETURN_UUID_P(uuid);
}