1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-27 07:42:10 +03:00

Use perfect hash for NFC and NFKC Unicode Normalization quick check

This makes the normalization quick check about 30% faster for NFC and
50% faster for NFKC than the binary search used previously.  The hash
lookup reuses the existing array of bit fields used for the binary
search to get the quick check property and is generated as part of "make
update-unicode" in src/common/unicode/.

Author: John Naylor
Reviewed-by: Mark Dilger, Michael Paquier
Discussion: https://postgr.es/m/CACPNZCt4fbJ0_bGrN5QPt34N4whv=mszM0LMVQdoa2rC9UMRXA@mail.gmail.com
This commit is contained in:
Michael Paquier
2020-10-11 19:09:01 +09:00
parent 85d08b8b72
commit 80f8eb79e2
5 changed files with 1681 additions and 24 deletions

View File

@@ -465,15 +465,32 @@ get_canonical_class(pg_wchar ch)
return entry->comb_class;
}
static int
qc_compare(const void *p1, const void *p2)
static const pg_unicode_normprops *
qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
{
uint32 v1,
v2;
int h;
uint32 hashkey;
v1 = ((const pg_unicode_normprops *) p1)->codepoint;
v2 = ((const pg_unicode_normprops *) p2)->codepoint;
return (v1 - v2);
/*
* Compute the hash function. The hash key is the codepoint with the bytes
* in network order.
*/
hashkey = htonl(ch);
h = norminfo->hash(&hashkey);
/* An out-of-range result implies no match */
if (h < 0 || h >= norminfo->num_normprops)
return NULL;
/*
* Since it's a perfect hash, we need only match to the specific codepoint
* it identifies.
*/
if (ch != norminfo->normprops[h].codepoint)
return NULL;
/* Success! */
return &norminfo->normprops[h];
}
/*
@@ -482,26 +499,15 @@ qc_compare(const void *p1, const void *p2)
static UnicodeNormalizationQC
qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
{
pg_unicode_normprops key;
pg_unicode_normprops *found = NULL;
key.codepoint = ch;
const pg_unicode_normprops *found = NULL;
switch (form)
{
case UNICODE_NFC:
found = bsearch(&key,
UnicodeNormProps_NFC_QC,
lengthof(UnicodeNormProps_NFC_QC),
sizeof(pg_unicode_normprops),
qc_compare);
found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
break;
case UNICODE_NFKC:
found = bsearch(&key,
UnicodeNormProps_NFKC_QC,
lengthof(UnicodeNormProps_NFKC_QC),
sizeof(pg_unicode_normprops),
qc_compare);
found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
break;
default:
Assert(false);