Suppress unnecessary regex subre nodes in a couple more cases.

This extends the changes made in commit cebc1d34e, teaching parseqatom() to generate fewer or cheaper subre nodes in some edge cases. The case of interest here is a quantified atom that is "messy" only because it has greediness opposite to what preceded it (whereas captures and backrefs are intrinsically messy). In this case we don't need an iteration node, since we don't care where the sub-matches of the quantifier are; and we might also not need a second concatenation node. This seems of only marginal real-world use according to my testing, but I wanted to get it in before wrapping up this series of regex performance fixes. Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
author: Tom Lane <tgl@sss.pgh.pa.us> 2021-03-02 12:14:14 -0500
committer: Tom Lane <tgl@sss.pgh.pa.us> 2021-03-02 12:14:14 -0500
commit: 4604f83fdfe030a2f1984159ede5270c1d825310 (patch)
tree: 75d9ab0ac8006905246dabfbef7b6ff1d71e8f31 /src/backend/regex/regcomp.c
parent: 0c3405cf11a12da1a4278c6833f4d979fe06c866 (diff)
download: postgresql-4604f83fdfe030a2f1984159ede5270c1d825310.tar.gz
postgresql-4604f83fdfe030a2f1984159ede5270c1d825310.zip
1 files changed, 39 insertions, 0 deletions
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 3c7627a955e..ba8dd864645 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -1216,6 +1216,23 @@ parseqatom(struct vars *v,
 		/* rest of branch can be strung starting from atom->end */
 		s2 = atom->end;
 	}
+	else if (!(atom->flags & (CAP | BACKR)))
+	{
+		/*
+		 * If there's no captures nor backrefs in the atom being repeated, we
+		 * don't really care where the submatches of the iteration are, so we
+		 * don't need an iteration node.  Make a plain DFA node instead.
+		 */
+		EMPTYARC(s, atom->begin);	/* empty prefix */
+		repeat(v, atom->begin, atom->end, m, n);
+		f = COMBINE(qprefer, atom->flags);
+		t = subre(v, '=', f, atom->begin, atom->end);
+		NOERR();
+		freesubre(v, atom);
+		*atomp = t;
+		/* rest of branch can be strung starting from t->end */
+		s2 = t->end;
+	}
 	else if (m > 0 && !(atom->flags & BACKR))
 	{
 		/*
@@ -1271,6 +1288,10 @@ parseqatom(struct vars *v,
 		t->flags |= COMBINE(t->flags, t->child->sibling->flags);
 		top->flags |= COMBINE(top->flags, t->flags);
 
+		/* neither t nor top could be directly marked for capture as yet */
+		assert(t->capno == 0);
+		assert(top->capno == 0);
+
 		/*
 		 * At this point both top and t are concatenation (op == '.') subres,
 		 * and we have top->child = prefix of branch, top->child->sibling = t,
@@ -1291,6 +1312,24 @@ parseqatom(struct vars *v,
 			top->child = t->child;
 			freesrnode(v, t);
 		}
+
+		/*
+		 * Otherwise, it's possible that t->child is not messy in itself, but
+		 * we considered it messy because its greediness conflicts with what
+		 * preceded it.  Then it could be that the combination of t->child and
+		 * the rest of the branch is also not messy, in which case we can get
+		 * rid of the child concatenation by merging t->child and the rest of
+		 * the branch into one plain DFA node.
+		 */
+		else if (t->child->op == '=' &&
+				 t->child->sibling->op == '=' &&
+				 !MESSY(UP(t->child->flags | t->child->sibling->flags)))
+		{
+			t->op = '=';
+			t->flags = COMBINE(t->child->flags, t->child->sibling->flags);
+			freesubreandsiblings(v, t->child);
+			t->child = NULL;
+		}
 	}
 	else
 	{
author	Tom Lane <tgl@sss.pgh.pa.us>	2021-03-02 12:14:14 -0500
committer	Tom Lane <tgl@sss.pgh.pa.us>	2021-03-02 12:14:14 -0500
commit	4604f83fdfe030a2f1984159ede5270c1d825310 (patch)
tree	75d9ab0ac8006905246dabfbef7b6ff1d71e8f31 /src/backend/regex/regcomp.c
parent	0c3405cf11a12da1a4278c6833f4d979fe06c866 (diff)
download	postgresql-4604f83fdfe030a2f1984159ede5270c1d825310.tar.gz postgresql-4604f83fdfe030a2f1984159ede5270c1d825310.zip