1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-27 07:42:10 +03:00

Use perfect hashing, instead of binary search, for keyword lookup.

We've been speculating for a long time that hash-based keyword lookup
ought to be faster than binary search, but up to now we hadn't found
a suitable tool for generating the hash function.  Joerg Sonnenberger
provided the inspiration, and sample code, to show us that rolling our
own generator wasn't a ridiculous idea.  Hence, do that.

The method used here requires a lookup table of approximately 4 bytes
per keyword, but that's less than what we saved in the predecessor commit
afb0d0712, so it's not a big problem.  The time savings are indeed
significant: preliminary testing suggests that the total time for raw
parsing (flex + bison phases) drops by ~20%.

Patch by me, but it owes its existence to Joerg Sonnenberger;
thanks also to John Naylor for review.

Discussion: https://postgr.es/m/20190103163340.GA15803@britannica.bec.de
This commit is contained in:
Tom Lane
2019-01-09 19:47:38 -05:00
parent 5d59a6c5ea
commit c64d0cd5ce
14 changed files with 520 additions and 111 deletions

View File

@@ -28,7 +28,10 @@ OBJS= preproc.o pgc.o type.o ecpg.o output.o parser.o \
keywords.o c_keywords.o ecpg_keywords.o typename.o descriptor.o variable.o \
$(WIN32RES)
GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl
# where to find gen_keywordlist.pl and subsidiary files
TOOLSDIR = $(top_srcdir)/src/tools
GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
# Suppress parallel build to avoid a bug in GNU make 3.82
# (see comments in ../Makefile)
@@ -56,11 +59,11 @@ preproc.y: ../../../backend/parser/gram.y parse.pl ecpg.addons ecpg.header ecpg.
$(PERL) $(srcdir)/check_rules.pl $(srcdir) $<
# generate keyword headers
c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST)
$(PERL) $(GEN_KEYWORDLIST) --varname ScanCKeywords $<
c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST_DEPS)
$(GEN_KEYWORDLIST) --varname ScanCKeywords --no-case-fold $<
ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST)
$(PERL) $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST_DEPS)
$(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
# Force these dependencies to be known even without dependency info built:
ecpg_keywords.o c_keywords.o keywords.o preproc.o pgc.o parser.o: preproc.h

View File

@@ -9,8 +9,6 @@
*/
#include "postgres_fe.h"
#include <ctype.h>
#include "preproc_extern.h"
#include "preproc.h"
@@ -32,39 +30,38 @@ static const uint16 ScanCKeywordTokens[] = {
*
* Returns the token value of the keyword, or -1 if no match.
*
* Do a binary search using plain strcmp() comparison. This is much like
* Do a perfect-hash lookup, confirmed by a plain strcmp(). This is much like
* ScanKeywordLookup(), except we want case-sensitive matching.
*/
int
ScanCKeywordLookup(const char *text)
ScanCKeywordLookup(const char *str)
{
const char *kw_string;
const uint16 *kw_offsets;
const uint16 *low;
const uint16 *high;
size_t len;
int h;
const char *kw;
if (strlen(text) > ScanCKeywords.max_kw_len)
return -1; /* too long to be any keyword */
/*
* Reject immediately if too long to be any keyword. This saves useless
* hashing work on long strings.
*/
len = strlen(str);
if (len > ScanCKeywords.max_kw_len)
return -1;
kw_string = ScanCKeywords.kw_string;
kw_offsets = ScanCKeywords.kw_offsets;
low = kw_offsets;
high = kw_offsets + (ScanCKeywords.num_keywords - 1);
/*
* Compute the hash function. Since it's a perfect hash, we need only
* match to the specific keyword it identifies.
*/
h = ScanCKeywords_hash_func(str, len);
while (low <= high)
{
const uint16 *middle;
int difference;
/* An out-of-range result implies no match */
if (h < 0 || h >= ScanCKeywords.num_keywords)
return -1;
middle = low + (high - low) / 2;
difference = strcmp(kw_string + *middle, text);
if (difference == 0)
return ScanCKeywordTokens[middle - kw_offsets];
else if (difference < 0)
low = middle + 1;
else
high = middle - 1;
}
kw = GetScanKeyword(h, &ScanCKeywords);
if (strcmp(kw, str) == 0)
return ScanCKeywordTokens[h];
return -1;
}

View File

@@ -20,8 +20,7 @@
/*
* List of (keyword-name, keyword-token-value) pairs.
*
* !!WARNING!!: This list must be sorted by ASCII name, because binary
* search is used to locate entries.
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
*/
/* name, value */

View File

@@ -20,8 +20,7 @@
/*
* List of (keyword-name, keyword-token-value) pairs.
*
* !!WARNING!!: This list must be sorted by ASCII name, because binary
* search is used to locate entries.
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
*/
/* name, value */