aboutsummaryrefslogtreecommitdiff
path: root/src/interfaces/ecpg/preproc/c_keywords.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2019-01-09 19:47:38 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2019-01-09 19:47:46 -0500
commitc64d0cd5ce24a344798534f1bc5827a9199b7a6e (patch)
tree3968456d54c3f18d07976e5a139ca60589a8fbf0 /src/interfaces/ecpg/preproc/c_keywords.c
parent5d59a6c5eaff4a58322683e450e76a11d943d322 (diff)
downloadpostgresql-c64d0cd5ce24a344798534f1bc5827a9199b7a6e.tar.gz
postgresql-c64d0cd5ce24a344798534f1bc5827a9199b7a6e.zip
Use perfect hashing, instead of binary search, for keyword lookup.
We've been speculating for a long time that hash-based keyword lookup ought to be faster than binary search, but up to now we hadn't found a suitable tool for generating the hash function. Joerg Sonnenberger provided the inspiration, and sample code, to show us that rolling our own generator wasn't a ridiculous idea. Hence, do that. The method used here requires a lookup table of approximately 4 bytes per keyword, but that's less than what we saved in the predecessor commit afb0d0712, so it's not a big problem. The time savings is indeed significant: preliminary testing suggests that the total time for raw parsing (flex + bison phases) drops by ~20%. Patch by me, but it owes its existence to Joerg Sonnenberger; thanks also to John Naylor for review. Discussion: https://postgr.es/m/20190103163340.GA15803@britannica.bec.de
Diffstat (limited to 'src/interfaces/ecpg/preproc/c_keywords.c')
-rw-r--r--src/interfaces/ecpg/preproc/c_keywords.c51
1 files changed, 24 insertions, 27 deletions
diff --git a/src/interfaces/ecpg/preproc/c_keywords.c b/src/interfaces/ecpg/preproc/c_keywords.c
index 38ddf6f1359..80aa7d5339c 100644
--- a/src/interfaces/ecpg/preproc/c_keywords.c
+++ b/src/interfaces/ecpg/preproc/c_keywords.c
@@ -9,8 +9,6 @@
*/
#include "postgres_fe.h"
-#include <ctype.h>
-
#include "preproc_extern.h"
#include "preproc.h"
@@ -32,39 +30,38 @@ static const uint16 ScanCKeywordTokens[] = {
*
* Returns the token value of the keyword, or -1 if no match.
*
- * Do a binary search using plain strcmp() comparison. This is much like
+ * Do a hash search using plain strcmp() comparison. This is much like
* ScanKeywordLookup(), except we want case-sensitive matching.
*/
int
-ScanCKeywordLookup(const char *text)
+ScanCKeywordLookup(const char *str)
{
- const char *kw_string;
- const uint16 *kw_offsets;
- const uint16 *low;
- const uint16 *high;
+ size_t len;
+ int h;
+ const char *kw;
+
+ /*
+ * Reject immediately if too long to be any keyword. This saves useless
+ * hashing work on long strings.
+ */
+ len = strlen(str);
+ if (len > ScanCKeywords.max_kw_len)
+ return -1;
- if (strlen(text) > ScanCKeywords.max_kw_len)
- return -1; /* too long to be any keyword */
+ /*
+ * Compute the hash function. Since it's a perfect hash, we need only
+ * match to the specific keyword it identifies.
+ */
+ h = ScanCKeywords_hash_func(str, len);
- kw_string = ScanCKeywords.kw_string;
- kw_offsets = ScanCKeywords.kw_offsets;
- low = kw_offsets;
- high = kw_offsets + (ScanCKeywords.num_keywords - 1);
+ /* An out-of-range result implies no match */
+ if (h < 0 || h >= ScanCKeywords.num_keywords)
+ return -1;
- while (low <= high)
- {
- const uint16 *middle;
- int difference;
+ kw = GetScanKeyword(h, &ScanCKeywords);
- middle = low + (high - low) / 2;
- difference = strcmp(kw_string + *middle, text);
- if (difference == 0)
- return ScanCKeywordTokens[middle - kw_offsets];
- else if (difference < 0)
- low = middle + 1;
- else
- high = middle - 1;
- }
+ if (strcmp(kw, str) == 0)
+ return ScanCKeywordTokens[h];
return -1;
}