about | summary | refs | log | tree | commit | diff
path: root/src/test/modules/test_regex/sql/test_regex_utf8.sql
diff options
context:
space:
mode:
author: Tom Lane <tgl@sss.pgh.pa.us> 2021-01-06 10:51:14 -0500
committer: Tom Lane <tgl@sss.pgh.pa.us> 2021-01-06 10:51:14 -0500
commit: ca8217c10138fa3ffe1e7d1def2484fd0eb78226 (patch)
tree: 65e42fc9ad482246780e2dc1abadb502c68cf517 /src/test/modules/test_regex/sql/test_regex_utf8.sql
parent: 4656e3d66893f286767285cf74dabb3877068e49 (diff)
download: postgresql-ca8217c10138fa3ffe1e7d1def2484fd0eb78226.tar.gz
download: postgresql-ca8217c10138fa3ffe1e7d1def2484fd0eb78226.zip
Add a test module for the regular expression package.
This module provides a function test_regex() that is functionally rather like regexp_matches(), but with additional debugging-oriented options and additional output. The debug options are somewhat obscure; they are chosen to match the API of the test harness that Henry Spencer wrote way-back-when for use in Tcl. With this, we can import all the test cases that Spencer wrote originally, even for regex functionality that we don't currently expose in Postgres. This seems necessary because we can no longer rely on Tcl to act as upstream and verify any fixes or improvements that we make. In addition to Spencer's tests, I added a few for lookbehind constraints (which we added in 2015, and Tcl still hasn't absorbed) that are modeled on his tests for lookahead constraints. After looking at code coverage reports, I also threw in a couple of tests to more fully exercise our "high colormap" logic. According to my testing, this brings the check-world coverage for src/backend/regex/ from 71.1% to 86.7% of lines. (coverage.postgresql.org shows a slightly different number, which I think is because it measures a non-assert build.) Discussion: https://postgr.es/m/2873268.1609732164@sss.pgh.pa.us
Diffstat (limited to 'src/test/modules/test_regex/sql/test_regex_utf8.sql')
-rw-r--r-- src/test/modules/test_regex/sql/test_regex_utf8.sql | 60
1 files changed, 60 insertions, 0 deletions
diff --git a/src/test/modules/test_regex/sql/test_regex_utf8.sql b/src/test/modules/test_regex/sql/test_regex_utf8.sql
new file mode 100644
index 00000000000..cfd9396194f
--- /dev/null
+++ b/src/test/modules/test_regex/sql/test_regex_utf8.sql
@@ -0,0 +1,60 @@
+/*
+ * This test must be run in a database with UTF-8 encoding,
+ * because other encodings don't support all the characters used.
+ */
+-- Guard: compute skip_test and quit the script unless the database encoding is UTF8.
+SELECT getdatabaseencoding() <> 'UTF8'
+ AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- Declare the client side as UTF-8 too, matching the encoding checked above.
+set client_encoding = utf8;
+-- With this on, backslashes in plain '...' literals are kept verbatim; only E'...' literals are unescaped.
+set standard_conforming_strings = on;
+
+
+-- Run the Tcl test cases that require Unicode
+-- Each case below quotes the original Tcl expectMatch line, then gives its SQL form.
+-- expectMatch 9.44 EMP* {a[\u00fe-\u0507][\u00ff-\u0300]b} \
+-- "a\u0102\u02ffb" "a\u0102\u02ffb"
+select * from test_regex('a[\u00fe-\u0507][\u00ff-\u0300]b', E'a\u0102\u02ffb', 'EMP*');
+-- Cases 13.27-13.34: each quoted/braced Tcl pair maps to the same SQL pattern, so every query appears twice.
+-- expectMatch 13.27 P "a\\U00001234x" "a\u1234x" "a\u1234x"
+select * from test_regex('a\U00001234x', E'a\u1234x', 'P');
+-- expectMatch 13.28 P {a\U00001234x} "a\u1234x" "a\u1234x"
+select * from test_regex('a\U00001234x', E'a\u1234x', 'P');
+-- expectMatch 13.29 P "a\\U0001234x" "a\u1234x" "a\u1234x"
+-- Tcl has relaxed their code to allow 1-8 hex digits, but Postgres hasn't
+select * from test_regex('a\U0001234x', E'a\u1234x', 'P');
+-- expectMatch 13.30 P {a\U0001234x} "a\u1234x" "a\u1234x"
+-- Tcl has relaxed their code to allow 1-8 hex digits, but Postgres hasn't
+select * from test_regex('a\U0001234x', E'a\u1234x', 'P');
+-- expectMatch 13.31 P "a\\U000012345x" "a\u12345x" "a\u12345x"
+select * from test_regex('a\U000012345x', E'a\u12345x', 'P');
+-- expectMatch 13.32 P {a\U000012345x} "a\u12345x" "a\u12345x"
+select * from test_regex('a\U000012345x', E'a\u12345x', 'P');
+-- expectMatch 13.33 P "a\\U1000000x" "a\ufffd0x" "a\ufffd0x"
+-- Tcl allows this as a standalone character, but Postgres doesn't
+select * from test_regex('a\U1000000x', E'a\ufffd0x', 'P');
+-- expectMatch 13.34 P {a\U1000000x} "a\ufffd0x" "a\ufffd0x"
+-- Tcl allows this as a standalone character, but Postgres doesn't
+select * from test_regex('a\U1000000x', E'a\ufffd0x', 'P');
+
+
+-- Additional tests, not derived from Tcl
+-- Exercise logic around high character ranges a bit more, using several overlapping \u ranges.
+-- NOTE(review): the pattern spans lines; the 'x' flag presumably enables expanded syntax (whitespace ignored) -- confirm.
+select * from test_regex('a
+ [\u1000-\u1100]*
+ [\u3000-\u3100]*
+ [\u1234-\u25ff]+
+ [\u2000-\u35ff]*
+ [\u2600-\u2f00]*
+ \u1236\u1236x',
+ E'a\u1234\u1236\u1236x', 'xEMP');
+-- Same pattern twice: the first subject contains the required \u1237, the second does not.
+select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
+ E'\u1500\u1237', 'ELMP');
+select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
+ E'A\u1239', 'ELMP');