mirror of
https://github.com/postgres/postgres.git
synced 2025-07-02 09:02:37 +03:00
Use perfect hashing, instead of binary search, for keyword lookup.
We've been speculating for a long time that hash-based keyword lookup
ought to be faster than binary search, but up to now we hadn't found
a suitable tool for generating the hash function. Joerg Sonnenberger
provided the inspiration, and sample code, to show us that rolling our
own generator wasn't a ridiculous idea. Hence, do that.
The method used here requires a lookup table of approximately 4 bytes
per keyword, but that's less than what we saved in the predecessor commit
afb0d0712, so it's not a big problem. The time savings is indeed
significant: preliminary testing suggests that the total time for raw
parsing (flex + bison phases) drops by ~20%.
Patch by me, but it owes its existence to Joerg Sonnenberger;
thanks also to John Naylor for review.
Discussion: https://postgr.es/m/20190103163340.GA15803@britannica.bec.de
This commit is contained in:
@ -63,6 +63,11 @@ OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o file_utils.o restricted_token.o
|
||||
OBJS_SHLIB = $(OBJS_FRONTEND:%.o=%_shlib.o)
|
||||
OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o)
|
||||
|
||||
# where to find gen_keywordlist.pl and subsidiary files
|
||||
TOOLSDIR = $(top_srcdir)/src/tools
|
||||
GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
|
||||
GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
|
||||
|
||||
all: libpgcommon.a libpgcommon_shlib.a libpgcommon_srv.a
|
||||
|
||||
distprep: kwlist_d.h
|
||||
@ -118,8 +123,8 @@ libpgcommon_srv.a: $(OBJS_SRV)
|
||||
$(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@
|
||||
|
||||
# generate SQL keyword lookup table to be included into keywords*.o.
|
||||
kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(top_srcdir)/src/tools/gen_keywordlist.pl
|
||||
$(PERL) $(top_srcdir)/src/tools/gen_keywordlist.pl --extern $<
|
||||
kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(GEN_KEYWORDLIST_DEPS)
|
||||
$(GEN_KEYWORDLIST) --extern $<
|
||||
|
||||
# Dependencies of keywords*.o need to be managed explicitly to make sure
|
||||
# that you don't get broken parsing code, even in a non-enable-depend build.
|
||||
|
@ -35,60 +35,51 @@
|
||||
* receive a different case-normalization mapping.
|
||||
*/
|
||||
int
|
||||
ScanKeywordLookup(const char *text,
|
||||
ScanKeywordLookup(const char *str,
|
||||
const ScanKeywordList *keywords)
|
||||
{
|
||||
int len,
|
||||
i;
|
||||
char word[NAMEDATALEN];
|
||||
const char *kw_string;
|
||||
const uint16 *kw_offsets;
|
||||
const uint16 *low;
|
||||
const uint16 *high;
|
||||
|
||||
len = strlen(text);
|
||||
|
||||
if (len > keywords->max_kw_len)
|
||||
return -1; /* too long to be any keyword */
|
||||
|
||||
/* We assume all keywords are shorter than NAMEDATALEN. */
|
||||
Assert(len < NAMEDATALEN);
|
||||
size_t len;
|
||||
int h;
|
||||
const char *kw;
|
||||
|
||||
/*
|
||||
* Apply an ASCII-only downcasing. We must not use tolower() since it may
|
||||
* produce the wrong translation in some locales (eg, Turkish).
|
||||
* Reject immediately if too long to be any keyword. This saves useless
|
||||
* hashing and downcasing work on long strings.
|
||||
*/
|
||||
for (i = 0; i < len; i++)
|
||||
len = strlen(str);
|
||||
if (len > keywords->max_kw_len)
|
||||
return -1;
|
||||
|
||||
/*
|
||||
* Compute the hash function. We assume it was generated to produce
|
||||
* case-insensitive results. Since it's a perfect hash, we need only
|
||||
* match to the specific keyword it identifies.
|
||||
*/
|
||||
h = keywords->hash(str, len);
|
||||
|
||||
/* An out-of-range result implies no match */
|
||||
if (h < 0 || h >= keywords->num_keywords)
|
||||
return -1;
|
||||
|
||||
/*
|
||||
* Compare character-by-character to see if we have a match, applying an
|
||||
* ASCII-only downcasing to the input characters. We must not use
|
||||
* tolower() since it may produce the wrong translation in some locales
|
||||
* (eg, Turkish).
|
||||
*/
|
||||
kw = GetScanKeyword(h, keywords);
|
||||
while (*str != '\0')
|
||||
{
|
||||
char ch = text[i];
|
||||
char ch = *str++;
|
||||
|
||||
if (ch >= 'A' && ch <= 'Z')
|
||||
ch += 'a' - 'A';
|
||||
word[i] = ch;
|
||||
if (ch != *kw++)
|
||||
return -1;
|
||||
}
|
||||
word[len] = '\0';
|
||||
if (*kw != '\0')
|
||||
return -1;
|
||||
|
||||
/*
|
||||
* Now do a binary search using plain strcmp() comparison.
|
||||
*/
|
||||
kw_string = keywords->kw_string;
|
||||
kw_offsets = keywords->kw_offsets;
|
||||
low = kw_offsets;
|
||||
high = kw_offsets + (keywords->num_keywords - 1);
|
||||
while (low <= high)
|
||||
{
|
||||
const uint16 *middle;
|
||||
int difference;
|
||||
|
||||
middle = low + (high - low) / 2;
|
||||
difference = strcmp(kw_string + *middle, word);
|
||||
if (difference == 0)
|
||||
return middle - kw_offsets;
|
||||
else if (difference < 0)
|
||||
low = middle + 1;
|
||||
else
|
||||
high = middle - 1;
|
||||
}
|
||||
|
||||
return -1;
|
||||
/* Success! */
|
||||
return h;
|
||||
}
|
||||
|
Reference in New Issue
Block a user