diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2021-02-20 19:26:41 -0500 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2021-02-20 19:26:41 -0500 |
commit | ea1268f6301cc7adce571cc9c5ebe8d9342a2ef4 (patch) | |
tree | 036b342d06097d3118a31dc9b3f65a5756c7730c /src/include/regex/regex.h | |
parent | 581043089472816061a7fd381f40572191dfa48f (diff) | |
download | postgresql-ea1268f6301cc7adce571cc9c5ebe8d9342a2ef4.tar.gz postgresql-ea1268f6301cc7adce571cc9c5ebe8d9342a2ef4.zip |
Avoid generating extra subre tree nodes for capturing parentheses.
Previously, each pair of capturing parentheses gave rise to a separate
subre tree node, whose only function was to identify that we ought to
capture the match details for this particular sub-expression. In
most cases we don't really need that, since we can perfectly well
put a "capture this" annotation on the child node that does the real
matching work. As with the two preceding commits, the main value
of this is to avoid generating and optimizing an NFA for a tree node
that's not really pulling its weight.
The chosen data representation only allows one capture annotation
per subre node. In the legal-per-spec, but seemingly not very useful,
case where there are multiple capturing parens around the exact same
bit of the regex (i.e. "((xyz))"), wrap the child node in N-1 capture
nodes that act the same as before. We could work harder at that but
I'll refrain, pending some evidence that such cases are worth troubling
over.
In passing, improve the comments in regex.h to say what all the
different re_info bits mean. Some of them were pretty obvious
but others not so much, so reverse-engineer some documentation.
This is part of a patch series that in total reduces the regex engine's
runtime by about a factor of four on a large corpus of real-world regexes.
Patch by me, reviewed by Joel Jacobson
Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
Diffstat (limited to 'src/include/regex/regex.h')
-rw-r--r-- | src/include/regex/regex.h | 32 |
1 files changed, 17 insertions, 15 deletions
diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index dc31899aa4d..2b48a19fb7d 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -56,21 +56,23 @@ typedef struct { int re_magic; /* magic number */ size_t re_nsub; /* number of subexpressions */ - long re_info; /* information about RE */ -#define REG_UBACKREF 000001 -#define REG_ULOOKAROUND 000002 -#define REG_UBOUNDS 000004 -#define REG_UBRACES 000010 -#define REG_UBSALNUM 000020 -#define REG_UPBOTCH 000040 -#define REG_UBBS 000100 -#define REG_UNONPOSIX 000200 -#define REG_UUNSPEC 000400 -#define REG_UUNPORT 001000 -#define REG_ULOCALE 002000 -#define REG_UEMPTYMATCH 004000 -#define REG_UIMPOSSIBLE 010000 -#define REG_USHORTEST 020000 + long re_info; /* bitmask of the following flags: */ +#define REG_UBACKREF 000001 /* has back-reference (\n) */ +#define REG_ULOOKAROUND 000002 /* has lookahead/lookbehind constraint */ +#define REG_UBOUNDS 000004 /* has bounded quantifier ({m,n}) */ +#define REG_UBRACES 000010 /* has { that doesn't begin a quantifier */ +#define REG_UBSALNUM 000020 /* has backslash-alphanumeric in non-ARE */ +#define REG_UPBOTCH 000040 /* has unmatched right paren in ERE (legal + * per spec, but that was a mistake) */ +#define REG_UBBS 000100 /* has backslash within bracket expr */ +#define REG_UNONPOSIX 000200 /* has any construct that extends POSIX */ +#define REG_UUNSPEC 000400 /* has any case disallowed by POSIX, e.g. + * an empty branch */ +#define REG_UUNPORT 001000 /* has numeric character code dependency */ +#define REG_ULOCALE 002000 /* has locale dependency */ +#define REG_UEMPTYMATCH 004000 /* can match a zero-length string */ +#define REG_UIMPOSSIBLE 010000 /* provably cannot match anything */ +#define REG_USHORTEST 020000 /* has non-greedy quantifier */ int re_csize; /* sizeof(character) */ char *re_endp; /* backward compatibility kludge */ Oid re_collation; /* Collation that defines LC_CTYPE behavior */ |