mirror of
https://github.com/postgres/postgres.git
synced 2025-08-27 07:42:10 +03:00
Use perfect hashing, instead of binary search, for keyword lookup.
We've been speculating for a long time that hash-based keyword lookup
ought to be faster than binary search, but up to now we hadn't found
a suitable tool for generating the hash function. Joerg Sonnenberger
provided the inspiration, and sample code, to show us that rolling our
own generator wasn't a ridiculous idea. Hence, do that.
The method used here requires a lookup table of approximately 4 bytes
per keyword, but that's less than what we saved in the predecessor commit
afb0d0712
, so it's not a big problem. The time savings is indeed
significant: preliminary testing suggests that the total time for raw
parsing (flex + bison phases) drops by ~20%.
Patch by me, but it owes its existence to Joerg Sonnenberger;
thanks also to John Naylor for review.
Discussion: https://postgr.es/m/20190103163340.GA15803@britannica.bec.de
This commit is contained in:
@@ -28,7 +28,10 @@ OBJS= preproc.o pgc.o type.o ecpg.o output.o parser.o \
|
||||
keywords.o c_keywords.o ecpg_keywords.o typename.o descriptor.o variable.o \
|
||||
$(WIN32RES)
|
||||
|
||||
GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl
|
||||
# where to find gen_keywordlist.pl and subsidiary files
|
||||
TOOLSDIR = $(top_srcdir)/src/tools
|
||||
GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
|
||||
GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
|
||||
|
||||
# Suppress parallel build to avoid a bug in GNU make 3.82
|
||||
# (see comments in ../Makefile)
|
||||
@@ -56,11 +59,11 @@ preproc.y: ../../../backend/parser/gram.y parse.pl ecpg.addons ecpg.header ecpg.
|
||||
$(PERL) $(srcdir)/check_rules.pl $(srcdir) $<
|
||||
|
||||
# generate keyword headers
|
||||
c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST)
|
||||
$(PERL) $(GEN_KEYWORDLIST) --varname ScanCKeywords $<
|
||||
c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST_DEPS)
|
||||
$(GEN_KEYWORDLIST) --varname ScanCKeywords --no-case-fold $<
|
||||
|
||||
ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST)
|
||||
$(PERL) $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
|
||||
ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST_DEPS)
|
||||
$(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
|
||||
|
||||
# Force these dependencies to be known even without dependency info built:
|
||||
ecpg_keywords.o c_keywords.o keywords.o preproc.o pgc.o parser.o: preproc.h
|
||||
|
@@ -9,8 +9,6 @@
|
||||
*/
|
||||
#include "postgres_fe.h"
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
#include "preproc_extern.h"
|
||||
#include "preproc.h"
|
||||
|
||||
@@ -32,39 +30,38 @@ static const uint16 ScanCKeywordTokens[] = {
|
||||
*
|
||||
* Returns the token value of the keyword, or -1 if no match.
|
||||
*
|
||||
* Do a binary search using plain strcmp() comparison. This is much like
|
||||
* Do a hash search using plain strcmp() comparison. This is much like
|
||||
* ScanKeywordLookup(), except we want case-sensitive matching.
|
||||
*/
|
||||
int
|
||||
ScanCKeywordLookup(const char *text)
|
||||
ScanCKeywordLookup(const char *str)
|
||||
{
|
||||
const char *kw_string;
|
||||
const uint16 *kw_offsets;
|
||||
const uint16 *low;
|
||||
const uint16 *high;
|
||||
size_t len;
|
||||
int h;
|
||||
const char *kw;
|
||||
|
||||
if (strlen(text) > ScanCKeywords.max_kw_len)
|
||||
return -1; /* too long to be any keyword */
|
||||
/*
|
||||
* Reject immediately if too long to be any keyword. This saves useless
|
||||
* hashing work on long strings.
|
||||
*/
|
||||
len = strlen(str);
|
||||
if (len > ScanCKeywords.max_kw_len)
|
||||
return -1;
|
||||
|
||||
kw_string = ScanCKeywords.kw_string;
|
||||
kw_offsets = ScanCKeywords.kw_offsets;
|
||||
low = kw_offsets;
|
||||
high = kw_offsets + (ScanCKeywords.num_keywords - 1);
|
||||
/*
|
||||
* Compute the hash function. Since it's a perfect hash, we need only
|
||||
* match to the specific keyword it identifies.
|
||||
*/
|
||||
h = ScanCKeywords_hash_func(str, len);
|
||||
|
||||
while (low <= high)
|
||||
{
|
||||
const uint16 *middle;
|
||||
int difference;
|
||||
/* An out-of-range result implies no match */
|
||||
if (h < 0 || h >= ScanCKeywords.num_keywords)
|
||||
return -1;
|
||||
|
||||
middle = low + (high - low) / 2;
|
||||
difference = strcmp(kw_string + *middle, text);
|
||||
if (difference == 0)
|
||||
return ScanCKeywordTokens[middle - kw_offsets];
|
||||
else if (difference < 0)
|
||||
low = middle + 1;
|
||||
else
|
||||
high = middle - 1;
|
||||
}
|
||||
kw = GetScanKeyword(h, &ScanCKeywords);
|
||||
|
||||
if (strcmp(kw, str) == 0)
|
||||
return ScanCKeywordTokens[h];
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
@@ -20,8 +20,7 @@
|
||||
/*
|
||||
* List of (keyword-name, keyword-token-value) pairs.
|
||||
*
|
||||
* !!WARNING!!: This list must be sorted by ASCII name, because binary
|
||||
* search is used to locate entries.
|
||||
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
|
||||
*/
|
||||
|
||||
/* name, value */
|
||||
|
@@ -20,8 +20,7 @@
|
||||
/*
|
||||
* List of (keyword-name, keyword-token-value) pairs.
|
||||
*
|
||||
* !!WARNING!!: This list must be sorted by ASCII name, because binary
|
||||
* search is used to locate entries.
|
||||
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
|
||||
*/
|
||||
|
||||
/* name, value */
|
||||
|
Reference in New Issue
Block a user