Add caching of ctype.h/wctype.h results in regc_locale.c.

While this doesn't save a huge amount of runtime, it still seems worth doing, especially since I realized that the data copying I did in my first draft was quite unnecessary. In this version, once we have the results cached, getting them back for re-use is really very cheap. Also, remove the hard-wired limitation to not consider wctype.h results for character codes above 255. It turns out that we can't push the limit as far up as I'd originally hoped, because the regex colormap code is not efficient enough to cope very well with character classes containing many thousand letters, which a Unicode locale is entirely capable of producing. Still, we can push it up to U+7FF (which I chose as the limit of 2-byte UTF8 characters), which will at least make Eastern Europeans happy pending a better solution. Thus, this commit resolves the specific complaint in bug #6457, but not the more general issue that letters of non-western alphabets are mostly not recognized as matching [[:alpha:]].
author: Tom Lane <tgl@sss.pgh.pa.us> 2012-02-19 21:01:13 -0500
committer: Tom Lane <tgl@sss.pgh.pa.us> 2012-02-19 21:01:13 -0500
commit: e00f68e49c148851187136d3278b7e9afa370537 (patch)
tree: eb2902e66d525e3ce96aa60f2193bf25f6b87425 /src/backend/regex/regc_locale.c
parent: 27af91438b68f46f4015853b6f75c6f5c3a8650c (diff)
download: postgresql-e00f68e49c148851187136d3278b7e9afa370537.tar.gz
postgresql-e00f68e49c148851187136d3278b7e9afa370537.zip
1 files changed, 39 insertions, 80 deletions
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
index 6cf27958b15..c0414a24912 100644
--- a/src/backend/regex/regc_locale.c
+++ b/src/backend/regex/regc_locale.c
@@ -351,6 +351,16 @@ static const struct cname
 
 
 /*
+ * We do not use the hard-wired Unicode classification tables that Tcl does.
+ * This is because (a) we need to deal with other encodings besides Unicode,
+ * and (b) we want to track the behavior of the libc locale routines as
+ * closely as possible.  For example, it wouldn't be unreasonable for a
+ * locale to not consider every Unicode letter as a letter.  So we build
+ * character classification cvecs by asking libc, even for Unicode.
+ */
+
+
+/*
  * element - map collating-element name to celt
  */
 static celt
@@ -489,7 +499,11 @@ eclass(struct vars * v,			/* context */
 /*
  * cclass - supply cvec for a character class
  *
- * Must include case counterparts on request.
+ * Must include case counterparts if "cases" is true.
+ *
+ * The returned cvec might be either a transient cvec gotten from getcvec(),
+ * or a permanently cached one from pg_ctype_get_cache().  This is okay
+ * because callers are not supposed to explicitly free the result either way.
  */
 static struct cvec *
 cclass(struct vars * v,			/* context */
@@ -548,79 +562,54 @@ cclass(struct vars * v,			/* context */
 		index = (int) CC_ALPHA;
 
 	/*
-	 * Now compute the character class contents.
-	 *
-	 * For the moment, assume that only char codes < 256 can be in these
-	 * classes.
+	 * Now compute the character class contents.  For classes that are
+	 * based on the behavior of a <wctype.h> or <ctype.h> function, we use
+	 * pg_ctype_get_cache so that we can cache the results.  Other classes
+	 * have definitions that are hard-wired here, and for those we just
+	 * construct a transient cvec on the fly.
 	 */
 
 	switch ((enum classes) index)
 	{
 		case CC_PRINT:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_isprint((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_isprint);
 			break;
 		case CC_ALNUM:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_isalnum((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_isalnum);
 			break;
 		case CC_ALPHA:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_isalpha((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_isalpha);
 			break;
 		case CC_ASCII:
+			/* hard-wired meaning */
 			cv = getcvec(v, 0, 1);
 			if (cv)
 				addrange(cv, 0, 0x7f);
 			break;
 		case CC_BLANK:
+			/* hard-wired meaning */
 			cv = getcvec(v, 2, 0);
 			addchr(cv, '\t');
 			addchr(cv, ' ');
 			break;
 		case CC_CNTRL:
+			/* hard-wired meaning */
 			cv = getcvec(v, 0, 2);
 			addrange(cv, 0x0, 0x1f);
 			addrange(cv, 0x7f, 0x9f);
 			break;
 		case CC_DIGIT:
-			cv = getcvec(v, 0, 1);
-			if (cv)
-				addrange(cv, (chr) '0', (chr) '9');
+			cv = pg_ctype_get_cache(pg_wc_isdigit);
 			break;
 		case CC_PUNCT:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_ispunct((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_ispunct);
 			break;
 		case CC_XDIGIT:
+			/*
+			 * It's not clear how to define this in non-western locales, and
+			 * even less clear that there's any particular use in trying.
+			 * So just hard-wire the meaning.
+			 */
 			cv = getcvec(v, 0, 3);
 			if (cv)
 			{
@@ -630,50 +619,20 @@ cclass(struct vars * v,			/* context */
 			}
 			break;
 		case CC_SPACE:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_isspace((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_isspace);
 			break;
 		case CC_LOWER:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_islower((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_islower);
 			break;
 		case CC_UPPER:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_isupper((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_isupper);
 			break;
 		case CC_GRAPH:
-			cv = getcvec(v, UCHAR_MAX, 0);
-			if (cv)
-			{
-				for (i = 0; i <= UCHAR_MAX; i++)
-				{
-					if (pg_wc_isgraph((chr) i))
-						addchr(cv, (chr) i);
-				}
-			}
+			cv = pg_ctype_get_cache(pg_wc_isgraph);
 			break;
 	}
+
+	/* If cv is NULL now, the reason must be "out of memory" */
 	if (cv == NULL)
 		ERR(REG_ESPACE);
 	return cv;
author	Tom Lane <tgl@sss.pgh.pa.us>	2012-02-19 21:01:13 -0500
committer	Tom Lane <tgl@sss.pgh.pa.us>	2012-02-19 21:01:13 -0500
commit	e00f68e49c148851187136d3278b7e9afa370537 (patch)
tree	eb2902e66d525e3ce96aa60f2193bf25f6b87425 /src/backend/regex/regc_locale.c
parent	27af91438b68f46f4015853b6f75c6f5c3a8650c (diff)
download	postgresql-e00f68e49c148851187136d3278b7e9afa370537.tar.gz postgresql-e00f68e49c148851187136d3278b7e9afa370537.zip