1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-02 09:02:37 +03:00

Use perfect hashing, instead of binary search, for keyword lookup.

We've been speculating for a long time that hash-based keyword lookup
ought to be faster than binary search, but up to now we hadn't found
a suitable tool for generating the hash function.  Joerg Sonnenberger
provided the inspiration, and sample code, to show us that rolling our
own generator wasn't a ridiculous idea.  Hence, do that.

The method used here requires a lookup table of approximately 4 bytes
per keyword, but that's less than what we saved in the predecessor commit
afb0d0712, so it's not a big problem.  The time savings are indeed
significant: preliminary testing suggests that the total time for raw
parsing (flex + bison phases) drops by ~20%.

Patch by me, but it owes its existence to Joerg Sonnenberger;
thanks also to John Naylor for review.

Discussion: https://postgr.es/m/20190103163340.GA15803@britannica.bec.de
This commit is contained in:
Tom Lane
2019-01-09 19:47:38 -05:00
parent 5d59a6c5ea
commit c64d0cd5ce
14 changed files with 520 additions and 111 deletions

View File

@ -63,6 +63,11 @@ OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o file_utils.o restricted_token.o
OBJS_SHLIB = $(OBJS_FRONTEND:%.o=%_shlib.o)
OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o)
# where to find gen_keywordlist.pl and subsidiary files
TOOLSDIR = $(top_srcdir)/src/tools
GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
all: libpgcommon.a libpgcommon_shlib.a libpgcommon_srv.a
distprep: kwlist_d.h
@ -118,8 +123,8 @@ libpgcommon_srv.a: $(OBJS_SRV)
$(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@
# generate SQL keyword lookup table to be included into keywords*.o.
kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(top_srcdir)/src/tools/gen_keywordlist.pl
$(PERL) $(top_srcdir)/src/tools/gen_keywordlist.pl --extern $<
kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(GEN_KEYWORDLIST_DEPS)
$(GEN_KEYWORDLIST) --extern $<
# Dependencies of keywords*.o need to be managed explicitly to make sure
# that you don't get broken parsing code, even in a non-enable-depend build.

View File

@ -35,60 +35,51 @@
* receive a different case-normalization mapping.
*/
int
ScanKeywordLookup(const char *text,
ScanKeywordLookup(const char *str,
const ScanKeywordList *keywords)
{
int len,
i;
char word[NAMEDATALEN];
const char *kw_string;
const uint16 *kw_offsets;
const uint16 *low;
const uint16 *high;
len = strlen(text);
if (len > keywords->max_kw_len)
return -1; /* too long to be any keyword */
/* We assume all keywords are shorter than NAMEDATALEN. */
Assert(len < NAMEDATALEN);
size_t len;
int h;
const char *kw;
/*
* Apply an ASCII-only downcasing. We must not use tolower() since it may
* produce the wrong translation in some locales (eg, Turkish).
* Reject immediately if too long to be any keyword. This saves useless
* hashing and downcasing work on long strings.
*/
for (i = 0; i < len; i++)
len = strlen(str);
if (len > keywords->max_kw_len)
return -1;
/*
* Compute the hash function. We assume it was generated to produce
* case-insensitive results. Since it's a perfect hash, we need only
* match to the specific keyword it identifies.
*/
h = keywords->hash(str, len);
/* An out-of-range result implies no match */
if (h < 0 || h >= keywords->num_keywords)
return -1;
/*
* Compare character-by-character to see if we have a match, applying an
* ASCII-only downcasing to the input characters. We must not use
* tolower() since it may produce the wrong translation in some locales
* (eg, Turkish).
*/
kw = GetScanKeyword(h, keywords);
while (*str != '\0')
{
char ch = text[i];
char ch = *str++;
if (ch >= 'A' && ch <= 'Z')
ch += 'a' - 'A';
word[i] = ch;
if (ch != *kw++)
return -1;
}
word[len] = '\0';
if (*kw != '\0')
return -1;
/*
* Now do a binary search using plain strcmp() comparison.
*/
kw_string = keywords->kw_string;
kw_offsets = keywords->kw_offsets;
low = kw_offsets;
high = kw_offsets + (keywords->num_keywords - 1);
while (low <= high)
{
const uint16 *middle;
int difference;
middle = low + (high - low) / 2;
difference = strcmp(kw_string + *middle, word);
if (difference == 0)
return middle - kw_offsets;
else if (difference < 0)
low = middle + 1;
else
high = middle - 1;
}
return -1;
/* Success! */
return h;
}