aboutsummaryrefslogtreecommitdiff
path: root/src/backend/regex
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2021-05-03 11:42:31 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2021-05-03 11:42:31 -0400
commitf68970e33f4dc48094c24c78c452ad730ae9ae12 (patch)
tree192ae33fa21801492e9d1833f8a151d2f24d6ea7 /src/backend/regex
parentb94409a02f6122d77b5154e481c0819fed6b4c95 (diff)
downloadpostgresql-f68970e33f4dc48094c24c78c452ad730ae9ae12.tar.gz
postgresql-f68970e33f4dc48094c24c78c452ad730ae9ae12.zip
Fix performance issue in new regex match-all detection code.
Commit 824bf7190 introduced a new search of the NFAs generated by regex compilation. I failed to think hard about the performance characteristics of that search, with the predictable outcome that it's bad: weird regexes can trigger exponential search time. Worse, there's no check-for-interrupt in that code, so you can't even cancel the query if this happens. Fix by introducing memo-ization of the search results, so that any one NFA state need be examined in detail just once. This potentially uses a lot of memory, but we can bound the memory usage by putting a limit on the number of states for which we'll try to prove match-all-ness. That is sane because we already have a limit (DUPINF) on the maximum finite string length that a matchall regex can match; and patterns that involve much more than DUPINF states would probably exceed that limit anyway. Also, rearrange the logic so that we check the basic is-the-graph- all-RAINBOW-arcs property before we start the recursive search to determine path lengths. This will ensure that we fall out quickly whenever the NFA couldn't possibly be matchall. Also stick in a check-for-interrupt, just in case these measures don't completely eliminate the risk of slowness. Discussion: https://postgr.es/m/3483895.1619898362@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex')
-rw-r--r--src/backend/regex/regc_nfa.c481
-rw-r--r--src/backend/regex/regcomp.c3
2 files changed, 321 insertions, 163 deletions
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
index 77b860cb0fd..6d77c59e121 100644
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -3032,96 +3032,189 @@ analyze(struct nfa *nfa)
static void
checkmatchall(struct nfa *nfa)
{
- bool hasmatch[DUPINF + 1];
- int minmatch,
- maxmatch,
- morematch;
+ bool **haspaths;
+ struct state *s;
+ int i;
/*
- * hasmatch[i] will be set true if a match of length i is feasible, for i
- * from 0 to DUPINF-1. hasmatch[DUPINF] will be set true if every match
- * length of DUPINF or more is feasible.
+ * If there are too many states, don't bother trying to detect matchall.
+ * This limit serves to bound the time and memory we could consume below.
+ * Note that even if the graph is all-RAINBOW, if there are significantly
+ * more than DUPINF states then it's likely that there are paths of length
+ * more than DUPINF, which would force us to fail anyhow. In practice,
+ * plausible ways of writing a matchall regex with maximum finite path
+ * length K tend not to have very many more than K states.
*/
- memset(hasmatch, 0, sizeof(hasmatch));
+ if (nfa->nstates > DUPINF * 2)
+ return;
/*
- * Recursively search the graph for all-RAINBOW paths to the "post" state,
- * starting at the "pre" state. The -1 initial depth accounts for the
- * fact that transitions out of the "pre" state are not part of the
- * matched string. We likewise don't count the final transition to the
- * "post" state as part of the match length. (But we still insist that
- * those transitions have RAINBOW arcs, otherwise there are lookbehind or
- * lookahead constraints at the start/end of the pattern.)
+ * First, scan all the states to verify that only RAINBOW arcs appear,
+ * plus pseudocolor arcs adjacent to the pre and post states. This lets
+ * us quickly eliminate most cases that aren't matchall NFAs.
*/
- if (!checkmatchall_recurse(nfa, nfa->pre, false, -1, hasmatch))
- return;
+ for (s = nfa->states; s != NULL; s = s->next)
+ {
+ struct arc *a;
+
+ for (a = s->outs; a != NULL; a = a->outchain)
+ {
+ if (a->type != PLAIN)
+ return; /* any LACONs make it non-matchall */
+ if (a->co != RAINBOW)
+ {
+ if (nfa->cm->cd[a->co].flags & PSEUDO)
+ {
+ /*
+ * Pseudocolor arc: verify it's in a valid place (this
+ * seems quite unlikely to fail, but let's be sure).
+ */
+ if (s == nfa->pre &&
+ (a->co == nfa->bos[0] || a->co == nfa->bos[1]))
+ /* okay BOS/BOL arc */ ;
+ else if (a->to == nfa->post &&
+ (a->co == nfa->eos[0] || a->co == nfa->eos[1]))
+ /* okay EOS/EOL arc */ ;
+ else
+ return; /* unexpected pseudocolor arc */
+ /* We'll check these arcs some more below. */
+ }
+ else
+ return; /* any other color makes it non-matchall */
+ }
+ }
+ /* Also, assert that the tmp fields are available for use. */
+ assert(s->tmp == NULL);
+ }
/*
- * We found some all-RAINBOW paths, and not anything that we couldn't
- * handle. Now verify that pseudocolor arcs adjacent to the pre and post
- * states match the RAINBOW arcs there. (We could do this while
- * recursing, but it's expensive and unlikely to fail, so do it last.)
+ * The next cheapest check we can make is to verify that the BOS/BOL
+ * outarcs of the pre state reach the same states as its RAINBOW outarcs.
+ * If they don't, the NFA expresses some constraints on the character
+ * before the matched string, making it non-matchall. Likewise, the
+ * EOS/EOL inarcs of the post state must match its RAINBOW inarcs.
*/
if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) ||
- !check_out_colors_match(nfa->pre, nfa->bos[0], RAINBOW) ||
!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) ||
- !check_out_colors_match(nfa->pre, nfa->bos[1], RAINBOW))
- return;
- if (!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
- !check_in_colors_match(nfa->post, nfa->eos[0], RAINBOW) ||
- !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]) ||
- !check_in_colors_match(nfa->post, nfa->eos[1], RAINBOW))
+ !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
+ !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]))
return;
/*
- * hasmatch[] now represents the set of possible match lengths; but we
- * want to reduce that to a min and max value, because it doesn't seem
- * worth complicating regexec.c to deal with nonconsecutive possible match
- * lengths. Find min and max of first run of lengths, then verify there
- * are no nonconsecutive lengths.
+ * Initialize an array of path-length arrays, in which
+ * checkmatchall_recurse will return per-state results. This lets us
+ * memo-ize the recursive search and avoid exponential time consumption.
*/
- for (minmatch = 0; minmatch <= DUPINF; minmatch++)
- {
- if (hasmatch[minmatch])
- break;
- }
- assert(minmatch <= DUPINF); /* else checkmatchall_recurse lied */
- for (maxmatch = minmatch; maxmatch < DUPINF; maxmatch++)
+ haspaths = (bool **) MALLOC(nfa->nstates * sizeof(bool *));
+ if (haspaths == NULL)
+ return; /* fail quietly */
+ memset(haspaths, 0, nfa->nstates * sizeof(bool *));
+
+ /*
+ * Recursively search the graph for all-RAINBOW paths to the "post" state,
+ * starting at the "pre" state, and computing the lengths of the paths.
+ * (Given the preceding checks, there should be at least one such path.
+ * However we could get back a false result anyway, in case there are
+ * multi-state loops, paths exceeding DUPINF+1 length, or non-algorithmic
+ * failures such as ENOMEM.)
+ */
+ if (checkmatchall_recurse(nfa, nfa->pre, haspaths))
{
- if (!hasmatch[maxmatch + 1])
- break;
+ /* The useful result is the path length array for the pre state */
+ bool *haspath = haspaths[nfa->pre->no];
+ int minmatch,
+ maxmatch,
+ morematch;
+
+ assert(haspath != NULL);
+
+ /*
+ * haspath[] now represents the set of possible path lengths; but we
+ * want to reduce that to a min and max value, because it doesn't seem
+ * worth complicating regexec.c to deal with nonconsecutive possible
+ * match lengths. Find min and max of first run of lengths, then
+ * verify there are no nonconsecutive lengths.
+ */
+ for (minmatch = 0; minmatch <= DUPINF + 1; minmatch++)
+ {
+ if (haspath[minmatch])
+ break;
+ }
+ assert(minmatch <= DUPINF + 1); /* else checkmatchall_recurse lied */
+ for (maxmatch = minmatch; maxmatch < DUPINF + 1; maxmatch++)
+ {
+ if (!haspath[maxmatch + 1])
+ break;
+ }
+ for (morematch = maxmatch + 1; morematch <= DUPINF + 1; morematch++)
+ {
+ if (haspath[morematch])
+ {
+ haspath = NULL; /* fail, there are nonconsecutive lengths */
+ break;
+ }
+ }
+
+ if (haspath != NULL)
+ {
+ /*
+ * Success, so record the info. Here we have a fine point: the
+ * path length from the pre state includes the pre-to-initial
+ * transition, so it's one more than the actually matched string
+ * length. (We avoided counting the final-to-post transition
+ * within checkmatchall_recurse, but not this one.) This is why
+ * checkmatchall_recurse allows one more level of path length than
+ * might seem necessary. This decrement also takes care of
+ * converting checkmatchall_recurse's definition of "infinity" as
+ * "DUPINF+1" to our normal representation as "DUPINF".
+ */
+ assert(minmatch > 0); /* else pre and post states were adjacent */
+ nfa->minmatchall = minmatch - 1;
+ nfa->maxmatchall = maxmatch - 1;
+ nfa->flags |= MATCHALL;
+ }
}
- for (morematch = maxmatch + 1; morematch <= DUPINF; morematch++)
+
+ /* Clean up */
+ for (i = 0; i < nfa->nstates; i++)
{
- if (hasmatch[morematch])
- return; /* fail, there are nonconsecutive lengths */
+ if (haspaths[i] != NULL)
+ FREE(haspaths[i]);
}
-
- /* Success, so record the info */
- nfa->minmatchall = minmatch;
- nfa->maxmatchall = maxmatch;
- nfa->flags |= MATCHALL;
+ FREE(haspaths);
}
/*
* checkmatchall_recurse - recursive search for checkmatchall
*
- * s is the current state
- * foundloop is true if any predecessor state has a loop-to-self
- * depth is the current recursion depth (starting at -1)
- * hasmatch[] is the output area for recording feasible match lengths
+ * s is the state to be examined in this recursion level.
+ * haspaths[] is an array of per-state exit path length arrays.
+ *
+ * We return true if the search was performed successfully, false if
+ * we had to fail because of multi-state loops or other internal reasons.
+ * (Because "dead" states that can't reach the post state have been
+ * eliminated, and we already verified that only RAINBOW and matching
+ * pseudocolor arcs exist, every state should have RAINBOW path(s) to
+ * the post state. Hence we take a false result from recursive calls
+ * as meaning that we'd better fail altogether, not just that that
+ * particular state can't reach the post state.)
*
- * We return true if there is at least one all-RAINBOW path to the "post"
- * state and no non-matchall paths; otherwise false. Note we assume that
- * any dead-end paths have already been removed, else we might return
- * false unnecessarily.
+ * On success, we store a malloc'd result array in haspaths[s->no],
+ * showing the possible path lengths from s to the post state.
+ * Each state's haspath[] array is of length DUPINF+2. The entries from
+ * k = 0 to DUPINF are true if there is an all-RAINBOW path of length k
+ * from this state to the string end. haspath[DUPINF+1] is true if all
+ * path lengths >= DUPINF+1 are possible. (Situations that cannot be
+ * represented under these rules cause failure.)
+ *
+ * checkmatchall is responsible for eventually freeing the haspath[] arrays.
*/
static bool
-checkmatchall_recurse(struct nfa *nfa, struct state *s,
- bool foundloop, int depth,
- bool *hasmatch)
+checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths)
{
bool result = false;
+ bool foundloop = false;
+ bool *haspath;
struct arc *a;
/*
@@ -3131,61 +3224,20 @@ checkmatchall_recurse(struct nfa *nfa, struct state *s,
if (STACK_TOO_DEEP(nfa->v->re))
return false;
- /*
- * Likewise, if we get to a depth too large to represent correctly in
- * maxmatchall, fail quietly.
- */
- if (depth >= DUPINF)
- return false;
-
- /*
- * Scan the outarcs to detect cases we can't handle, and to see if there
- * is a loop-to-self here. We need to know about any such loop before we
- * recurse, so it's hard to avoid making two passes over the outarcs. In
- * any case, checking for showstoppers before we recurse is probably best.
- */
- for (a = s->outs; a != NULL; a = a->outchain)
+ /* In case the search takes a long time, check for cancel */
+ if (CANCEL_REQUESTED(nfa->v->re))
{
- if (a->type != PLAIN)
- return false; /* any LACONs make it non-matchall */
- if (a->co != RAINBOW)
- {
- if (nfa->cm->cd[a->co].flags & PSEUDO)
- {
- /*
- * Pseudocolor arc: verify it's in a valid place (this seems
- * quite unlikely to fail, but let's be sure).
- */
- if (s == nfa->pre &&
- (a->co == nfa->bos[0] || a->co == nfa->bos[1]))
- /* okay BOS/BOL arc */ ;
- else if (a->to == nfa->post &&
- (a->co == nfa->eos[0] || a->co == nfa->eos[1]))
- /* okay EOS/EOL arc */ ;
- else
- return false; /* unexpected pseudocolor arc */
- /* We'll finish checking these arcs after the recursion */
- continue;
- }
- return false; /* any other color makes it non-matchall */
- }
- if (a->to == s)
- {
- /*
- * We found a cycle of length 1, so remember that to pass down to
- * successor states. (It doesn't matter if there was also such a
- * loop at a predecessor state.)
- */
- foundloop = true;
- }
- else if (a->to->tmp)
- {
- /* We found a cycle of length > 1, so fail. */
- return false;
- }
+ NERR(REG_CANCEL);
+ return false;
}
- /* We need to recurse, so mark state as under consideration */
+ /* Create a haspath array for this state */
+ haspath = (bool *) MALLOC((DUPINF + 2) * sizeof(bool));
+ if (haspath == NULL)
+ return false; /* again, treat as non-matchall */
+ memset(haspath, 0, (DUPINF + 2) * sizeof(bool));
+
+ /* Mark this state as being visited */
assert(s->tmp == NULL);
s->tmp = s;
@@ -3197,95 +3249,202 @@ checkmatchall_recurse(struct nfa *nfa, struct state *s,
{
/* We found an all-RAINBOW path to the post state */
result = true;
- /* ... which should not be adjacent to the pre state */
- if (depth < 0)
+
+ /*
+ * Mark this state as being zero steps away from the string end
+ * (the transition to the post state isn't counted).
+ */
+ haspath[0] = true;
+ }
+ else if (a->to == s)
+ {
+ /* We found a cycle of length 1, which we'll deal with below. */
+ foundloop = true;
+ }
+ else if (a->to->tmp != NULL)
+ {
+ /* It's busy, so we found a cycle of length > 1, so fail. */
+ result = false;
+ break;
+ }
+ else
+ {
+ /* Consider paths forward through this to-state. */
+ bool *nexthaspath;
+ int i;
+
+ /* If to-state was not already visited, recurse */
+ if (haspaths[a->to->no] == NULL)
{
- NERR(REG_ASSERT);
- return false;
+ result = checkmatchall_recurse(nfa, a->to, haspaths);
+ /* Fail if any recursive path fails */
+ if (!result)
+ break;
}
- /* Record potential match lengths */
- hasmatch[depth] = true;
- if (foundloop)
+ else
{
- /* A predecessor loop makes all larger lengths match, too */
- int i;
+ /* The previous visit must have found path(s) to the end */
+ result = true;
+ }
+ assert(a->to->tmp == NULL);
+ nexthaspath = haspaths[a->to->no];
+ assert(nexthaspath != NULL);
- for (i = depth + 1; i <= DUPINF; i++)
- hasmatch[i] = true;
+ /*
+ * Now, for every path of length i from a->to to the string end,
+ * there is a path of length i + 1 from s to the string end.
+ */
+ if (nexthaspath[DUPINF] != nexthaspath[DUPINF + 1])
+ {
+ /*
+ * a->to has a path of length exactly DUPINF, but not longer;
+ * or it has paths of all lengths > DUPINF but not one of
+ * exactly that length. In either case, we cannot represent
+ * the possible path lengths from s correctly, so fail.
+ */
+ result = false;
+ break;
}
+ /* Merge knowledge of these path lengths into what we have */
+ for (i = 0; i < DUPINF; i++)
+ haspath[i + 1] |= nexthaspath[i];
+ /* Infinity + 1 is still infinity */
+ haspath[DUPINF + 1] |= nexthaspath[DUPINF + 1];
}
- else if (a->to != s)
+ }
+
+ if (result && foundloop)
+ {
+ /*
+ * If there is a length-1 loop at this state, then find the shortest
+ * known path length to the end. The loop means that every larger
+ * path length is possible, too. (It doesn't matter whether any of
+ * the longer lengths were already known possible.)
+ */
+ int i;
+
+ for (i = 0; i <= DUPINF; i++)
{
- /* This is a new path forward; recurse to investigate */
- result = checkmatchall_recurse(nfa, a->to,
- foundloop, depth + 1,
- hasmatch);
- /* Fail if any recursive path fails */
- if (!result)
+ if (haspath[i])
break;
}
+ for (i++; i <= DUPINF + 1; i++)
+ haspath[i] = true;
}
+ /* Report out the completed path length map */
+ assert(s->no < nfa->nstates);
+ assert(haspaths[s->no] == NULL);
+ haspaths[s->no] = haspath;
+
+ /* Mark state no longer busy */
s->tmp = NULL;
+
return result;
}
/*
* check_out_colors_match - subroutine for checkmatchall
*
- * Check if every s outarc of color co1 has a matching outarc of color co2.
- * (checkmatchall_recurse already verified that all of the outarcs are PLAIN,
- * so we need not examine arc types here. Also, since it verified that there
- * are only RAINBOW and pseudocolor arcs, there shouldn't be enough arcs for
- * this brute-force O(N^2) implementation to cause problems.)
+ * Check whether the set of states reachable from s by arcs of color co1
+ * is equivalent to the set reachable by arcs of color co2.
+ * checkmatchall already verified that all of the NFA's arcs are PLAIN,
+ * so we need not examine arc types here.
*/
static bool
check_out_colors_match(struct state *s, color co1, color co2)
{
- struct arc *a1;
- struct arc *a2;
+ bool result = true;
+ struct arc *a;
- for (a1 = s->outs; a1 != NULL; a1 = a1->outchain)
+ /*
+ * To do this in linear time, we assume that the NFA contains no duplicate
+ * arcs. Run through the out-arcs, marking states reachable by arcs of
+ * color co1. Run through again, un-marking states reachable by arcs of
+ * color co2; if we see a not-marked state, we know this co2 arc is
+ * unmatched. Then run through again, checking for still-marked states,
+ * and in any case leaving all the tmp fields reset to NULL.
+ */
+ for (a = s->outs; a != NULL; a = a->outchain)
{
- if (a1->co != co1)
- continue;
- for (a2 = s->outs; a2 != NULL; a2 = a2->outchain)
+ if (a->co == co1)
{
- if (a2->co == co2 && a2->to == a1->to)
- break;
+ assert(a->to->tmp == NULL);
+ a->to->tmp = a->to;
+ }
+ }
+ for (a = s->outs; a != NULL; a = a->outchain)
+ {
+ if (a->co == co2)
+ {
+ if (a->to->tmp != NULL)
+ a->to->tmp = NULL;
+ else
+ result = false; /* unmatched co2 arc */
}
- if (a2 == NULL)
- return false;
}
- return true;
+ for (a = s->outs; a != NULL; a = a->outchain)
+ {
+ if (a->co == co1)
+ {
+ if (a->to->tmp != NULL)
+ {
+ result = false; /* unmatched co1 arc */
+ a->to->tmp = NULL;
+ }
+ }
+ }
+ return result;
}
/*
* check_in_colors_match - subroutine for checkmatchall
*
- * Check if every s inarc of color co1 has a matching inarc of color co2.
- * (For paranoia's sake, ignore any non-PLAIN arcs here. But we're still
- * not expecting very many arcs.)
+ * Check whether the set of states that can reach s by arcs of color co1
+ * is equivalent to the set that can reach s by arcs of color co2.
+ * checkmatchall already verified that all of the NFA's arcs are PLAIN,
+ * so we need not examine arc types here.
*/
static bool
check_in_colors_match(struct state *s, color co1, color co2)
{
- struct arc *a1;
- struct arc *a2;
+ bool result = true;
+ struct arc *a;
- for (a1 = s->ins; a1 != NULL; a1 = a1->inchain)
+ /*
+ * Identical algorithm to check_out_colors_match, except examine the
+ * from-states of s' inarcs.
+ */
+ for (a = s->ins; a != NULL; a = a->inchain)
{
- if (a1->type != PLAIN || a1->co != co1)
- continue;
- for (a2 = s->ins; a2 != NULL; a2 = a2->inchain)
+ if (a->co == co1)
{
- if (a2->type == PLAIN && a2->co == co2 && a2->from == a1->from)
- break;
+ assert(a->from->tmp == NULL);
+ a->from->tmp = a->from;
}
- if (a2 == NULL)
- return false;
}
- return true;
+ for (a = s->ins; a != NULL; a = a->inchain)
+ {
+ if (a->co == co2)
+ {
+ if (a->from->tmp != NULL)
+ a->from->tmp = NULL;
+ else
+ result = false; /* unmatched co2 arc */
+ }
+ }
+ for (a = s->ins; a != NULL; a = a->inchain)
+ {
+ if (a->co == co1)
+ {
+ if (a->from->tmp != NULL)
+ {
+ result = false; /* unmatched co1 arc */
+ a->from->tmp = NULL;
+ }
+ }
+ }
+ return result;
}
/*
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index ba8dd864645..9f71177d318 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -182,8 +182,7 @@ static void markreachable(struct nfa *, struct state *, struct state *, struct s
static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
static long analyze(struct nfa *);
static void checkmatchall(struct nfa *);
-static bool checkmatchall_recurse(struct nfa *, struct state *,
- bool, int, bool *);
+static bool checkmatchall_recurse(struct nfa *, struct state *, bool **);
static bool check_out_colors_match(struct state *, color, color);
static bool check_in_colors_match(struct state *, color, color);
static void compact(struct nfa *, struct cnfa *);