diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2020-01-13 15:04:31 -0500 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2020-01-13 15:04:31 -0500 |
commit | 7f380c59f800f7e0fb49f45a6ff7787256851a59 (patch) | |
tree | 76743b1ec372574af81c2d1340180ef809b9a542 /src/interfaces/ecpg/preproc/parser.c | |
parent | 259bbe177808986e5d226ea7ce5a1ebb74657791 (diff) | |
download | postgresql-7f380c59f800f7e0fb49f45a6ff7787256851a59.tar.gz postgresql-7f380c59f800f7e0fb49f45a6ff7787256851a59.zip |
Reduce size of backend scanner's tables.
Previously, the core scanner's yy_transition[] array had 37045 elements.
Since that number is larger than INT16_MAX, Flex generated the array to
contain 32-bit integers. By reimplementing some of the bulkier scanner
rules, this patch reduces the array to 20495 elements. The much smaller
total length, combined with the consequent use of 16-bit integers for
the array elements, reduces the binary size by over 200kB. This was
accomplished in two ways:
1. Consolidate handling of quote continuations into a new start condition,
rather than duplicating that logic for five different string types.
2. Treat Unicode strings and identifiers followed by a UESCAPE sequence
as three separate tokens, rather than one. The logic to de-escape
Unicode strings is moved to the filter code in parser.c, which already
had the ability to provide special processing for token sequences.
While we could have implemented the conversion in the grammar, that
approach was rejected for performance and maintainability reasons.
Performance in microbenchmarks of raw parsing seems equal or slightly
faster in most cases, and it's reasonable to expect that in real-world
usage (with more competition for the CPU cache) there will be a larger
win. The exception is UESCAPE sequences; lexing those is about 10%
slower, primarily because the scanner now has to be called three times
rather than one. This seems acceptable since that feature is very
rarely used.
The psql and ecpg lexers are likewise modified, primarily because we
want to keep them all in sync. Since those lexers don't use the
space-hogging -CF option, the space savings is much less, but it's
still good for perhaps 10kB apiece.
While at it, merge the ecpg lexer's handling of C-style comments used
in SQL and in C. Those have different rules regarding nested comments,
but since we already have the ability to keep track of the previous
start condition, we can use that to handle both cases within a single
start condition. This matches the core scanner more closely.
John Naylor
Discussion: https://postgr.es/m/CACPNZCvaoa3EgVWm5yZhcSTX6RAtaLgniCPcBVOCwm8h3xpWkw@mail.gmail.com
Diffstat (limited to 'src/interfaces/ecpg/preproc/parser.c')
-rw-r--r-- | src/interfaces/ecpg/preproc/parser.c | 118 |
1 files changed, 93 insertions, 25 deletions
diff --git a/src/interfaces/ecpg/preproc/parser.c b/src/interfaces/ecpg/preproc/parser.c index c27de59828a..a2eeeba2174 100644 --- a/src/interfaces/ecpg/preproc/parser.c +++ b/src/interfaces/ecpg/preproc/parser.c @@ -6,6 +6,9 @@ * This should match src/backend/parser/parser.c, except that we do not * need to bother with re-entrant interfaces. * + * Note: ECPG doesn't report error location like the backend does. + * This file will need work if we ever want it to. + * * * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -27,8 +30,9 @@ static int lookahead_token; /* one-token lookahead */ static YYSTYPE lookahead_yylval; /* yylval for lookahead token */ static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */ static char *lookahead_yytext; /* start current token */ -static char *lookahead_end; /* end of current token */ -static char lookahead_hold_char; /* to be put back at *lookahead_end */ + +static bool check_uescapechar(unsigned char escape); +static bool ecpg_isspace(char ch); /* @@ -43,13 +47,16 @@ static char lookahead_hold_char; /* to be put back at *lookahead_end */ * words. Furthermore it's not clear how to do that without re-introducing * scanner backtrack, which would cost more performance than this filter * layer does. + * + * We also use this filter to convert UIDENT and USCONST sequences into + * plain IDENT and SCONST tokens. While that could be handled by additional + * productions in the main grammar, it's more efficient to do it like this. 
*/ int filtered_base_yylex(void) { int cur_token; int next_token; - int cur_token_length; YYSTYPE cur_yylval; YYLTYPE cur_yylloc; char *cur_yytext; @@ -61,41 +68,26 @@ filtered_base_yylex(void) base_yylval = lookahead_yylval; base_yylloc = lookahead_yylloc; base_yytext = lookahead_yytext; - *lookahead_end = lookahead_hold_char; have_lookahead = false; } else cur_token = base_yylex(); /* - * If this token isn't one that requires lookahead, just return it. If it - * does, determine the token length. (We could get that via strlen(), but - * since we have such a small set of possibilities, hardwiring seems - * feasible and more efficient.) + * If this token isn't one that requires lookahead, just return it. */ switch (cur_token) { case NOT: - cur_token_length = 3; - break; case NULLS_P: - cur_token_length = 5; - break; case WITH: - cur_token_length = 4; + case UIDENT: + case USCONST: break; default: return cur_token; } - /* - * Identify end+1 of current token. base_yylex() has temporarily stored a - * '\0' here, and will undo that when we call it again. We need to redo - * it to fully revert the lookahead call for error reporting purposes. 
- */ - lookahead_end = base_yytext + cur_token_length; - Assert(*lookahead_end == '\0'); - /* Save and restore lexer output variables around the call */ cur_yylval = base_yylval; cur_yylloc = base_yylloc; @@ -113,10 +105,6 @@ filtered_base_yylex(void) base_yylloc = cur_yylloc; base_yytext = cur_yytext; - /* Now revert the un-truncation of the current token */ - lookahead_hold_char = *lookahead_end; - *lookahead_end = '\0'; - have_lookahead = true; /* Replace cur_token if needed, based on lookahead */ @@ -157,7 +145,87 @@ filtered_base_yylex(void) break; } break; + case UIDENT: + case USCONST: + /* Look ahead for UESCAPE */ + if (next_token == UESCAPE) + { + /* Yup, so get third token, which had better be SCONST */ + const char *escstr; + + /* + * Again save and restore lexer output variables around the + * call + */ + cur_yylval = base_yylval; + cur_yylloc = base_yylloc; + cur_yytext = base_yytext; + + /* Get third token */ + next_token = base_yylex(); + + if (next_token != SCONST) + mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal"); + + /* + * Save and check escape string, which the scanner returns + * with quotes + */ + escstr = base_yylval.str; + if (strlen(escstr) != 3 || !check_uescapechar(escstr[1])) + mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character"); + + base_yylval = cur_yylval; + base_yylloc = cur_yylloc; + base_yytext = cur_yytext; + + /* Combine 3 tokens into 1 */ + base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr); + + /* Clear have_lookahead, thereby consuming all three tokens */ + have_lookahead = false; + } + + if (cur_token == UIDENT) + cur_token = IDENT; + else if (cur_token == USCONST) + cur_token = SCONST; + break; } return cur_token; } + +/* + * check_uescapechar() and ecpg_isspace() should match their equivalents + * in pgc.l. + */ + +/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? 
*/ +static bool +check_uescapechar(unsigned char escape) +{ + if (isxdigit(escape) + || escape == '+' + || escape == '\'' + || escape == '"' + || ecpg_isspace(escape)) + return false; + else + return true; +} + +/* + * ecpg_isspace() --- return true if flex scanner considers char whitespace + */ +static bool +ecpg_isspace(char ch) +{ + if (ch == ' ' || + ch == '\t' || + ch == '\n' || + ch == '\r' || + ch == '\f') + return true; + return false; +} |