Use wide-character library routines, if available, for upper/lower/initcap functions.

This allows these functions to work correctly with Unicode and other multibyte encodings. Per prior discussion. Also, revert my earlier change to move installation path mashing from Makefile.global to configure. It turns out not to work well because the configure script is working with unexpanded variables, and so fails to match in cases where it should match.
author: Tom Lane <tgl@sss.pgh.pa.us> 2004-05-22 00:34:51 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2004-05-22 00:34:51 +0000
commit: 398386943936fbf66291ae3c6ed307779e4b90e4 (patch)
tree: af0523da83c2fe61a30db6de515a6e544051eed3 /src/backend/utils/adt/oracle_compat.c
parent: add8b70dda0439c8b3a89440576cbfc19989ed39 (diff)
download: postgresql-398386943936fbf66291ae3c6ed307779e4b90e4.tar.gz
postgresql-398386943936fbf66291ae3c6ed307779e4b90e4.zip
1 files changed, 189 insertions, 1 deletions
diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c
index eca71de6fc6..7f381438ed2 100644
--- a/src/backend/utils/adt/oracle_compat.c
+++ b/src/backend/utils/adt/oracle_compat.c
@@ -9,23 +9,144 @@
  *
  *
  * IDENTIFICATION
- *	$PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.50 2004/02/27 03:59:23 neilc Exp $
+ *	$PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.51 2004/05/22 00:34:50 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
+/*
+ * towlower() and friends should be in <wctype.h>, but some pre-C99 systems
+ * declare them in <wchar.h>.
+ */
 #include <ctype.h>
+#ifdef HAVE_WCHAR_H
+#include <wchar.h>
+#endif
+#ifdef HAVE_WCTYPE_H
+#include <wctype.h>
+#endif
 
 #include "utils/builtins.h"
 #include "mb/pg_wchar.h"
 
 
+/*
+ * If the system provides the needed functions for wide-character manipulation
+ * (which are all standardized by C99), then we implement upper/lower/initcap
+ * using wide-character functions.  Otherwise we use the traditional <ctype.h>
+ * functions, which of course will not work as desired in multibyte character
+ * sets.  Note that in either case we are effectively assuming that the
+ * database character encoding matches the encoding implied by LC_CTYPE.
+ *
+ * We assume if we have these two functions, we have their friends too, and
+ * can use the wide-character method.
+ */
+#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
+#define USE_WIDE_UPPER_LOWER
+#endif
+
 static text *dotrim(const char *string, int stringlen,
 	   const char *set, int setlen,
 	   bool doltrim, bool dortrim);
 
 
+#ifdef USE_WIDE_UPPER_LOWER
+
+/*
+ * Convert a TEXT value into a palloc'd, zero-terminated wchar_t string.
+ */
+static wchar_t *
+texttowcs(const text *txt)
+{
+	int			nbytes = VARSIZE(txt) - VARHDRSZ;
+	char	   *workstr;
+	wchar_t	   *result;
+	size_t		ncodes;
+
+	/* Overflow paranoia: (nbytes + 1) * sizeof(wchar_t) must fit in an int */
+	if (nbytes < 0 ||
+		nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
+	/* mbstowcs needs a null-terminated version of the input */
+	workstr = (char *) palloc(nbytes + 1);
+	memcpy(workstr, VARDATA(txt), nbytes);
+	workstr[nbytes] = '\0';
+
+	/* Output workspace cannot have more codes than input bytes (plus a 0) */
+	result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
+
+	/* Do the conversion; a result of (size_t) -1 means it failed */
+	ncodes = mbstowcs(result, workstr, nbytes + 1);
+
+	if (ncodes == (size_t) -1)
+	{
+		/*
+		 * Invalid multibyte character encountered.  We try to give a useful
+		 * error message by letting pg_verifymbstr check the string.  But
+		 * it's possible that the string is OK to us, and not OK to mbstowcs
+		 * --- this suggests that the LC_CTYPE locale is different from the
+		 * database encoding.  Give a generic error message if verifymbstr
+		 * can't find anything wrong.
+		 */
+		pg_verifymbstr(workstr, nbytes, false);
+		ereport(ERROR,
+				(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+				 errmsg("invalid multibyte character for locale")));
+	}
+
+	Assert(ncodes <= (size_t) nbytes);
+
+	return result;
+}
+
+
+/*
+ * Convert a zero-terminated wchar string into a palloc'd TEXT value.  We also
+ * require the caller to pass the string length in wide characters (ncodes,
+ * terminator excluded), since it will know it anyway in current uses.
+ */
+static text *
+wcstotext(const wchar_t *str, int ncodes)
+{
+	text	   *result;
+	size_t		nbytes;
+
+	/* Overflow paranoia: the worst-case output size must fit in an int */
+	if (ncodes < 0 ||
+		ncodes > (int) ((INT_MAX - VARHDRSZ) / MB_CUR_MAX) - 1)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
+	/* Make workspace certainly large enough: MB_CUR_MAX bytes per code */
+	result = (text *) palloc((ncodes + 1) * MB_CUR_MAX + VARHDRSZ);
+
+	/* Do the conversion, writing directly into the TEXT data area */
+	nbytes = wcstombs((char *) VARDATA(result), str,
+					  (ncodes + 1) * MB_CUR_MAX);
+
+	if (nbytes == (size_t) -1)
+	{
+		/* Invalid multibyte character encountered ... shouldn't happen */
+		ereport(ERROR,
+				(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+				 errmsg("invalid multibyte character for locale")));
+	}
+
+	Assert(nbytes <= (size_t) (ncodes * MB_CUR_MAX));
+
+	VARATT_SIZEP(result) = nbytes + VARHDRSZ;	/* bytes converted, not workspace size */
+
+	return result;
+}
+
+#endif /* USE_WIDE_UPPER_LOWER */
+
+
 /********************************************************************
  *
  * lower
@@ -43,6 +164,25 @@ static text *dotrim(const char *string, int stringlen,
 Datum
 lower(PG_FUNCTION_ARGS)
 {
+#ifdef USE_WIDE_UPPER_LOWER
+	text	   *string = PG_GETARG_TEXT_P(0);
+	text	   *result;
+	wchar_t	   *workspace;
+	int			i;
+
+	workspace = texttowcs(string);
+
+	for (i = 0; workspace[i] != 0; i++)
+		workspace[i] = towlower(workspace[i]);
+
+	result = wcstotext(workspace, i);
+
+	pfree(workspace);
+
+	PG_RETURN_TEXT_P(result);
+
+#else /* !USE_WIDE_UPPER_LOWER */
+
 	text	   *string = PG_GETARG_TEXT_P_COPY(0);
 	char	   *ptr;
 	int			m;
@@ -58,6 +198,7 @@ lower(PG_FUNCTION_ARGS)
 	}
 
 	PG_RETURN_TEXT_P(string);
+#endif /* USE_WIDE_UPPER_LOWER */
 }
 
 
@@ -78,6 +219,25 @@ lower(PG_FUNCTION_ARGS)
 Datum
 upper(PG_FUNCTION_ARGS)
 {
+#ifdef USE_WIDE_UPPER_LOWER
+	text	   *string = PG_GETARG_TEXT_P(0);
+	text	   *result;
+	wchar_t	   *workspace;
+	int			i;
+
+	workspace = texttowcs(string);
+
+	for (i = 0; workspace[i] != 0; i++)
+		workspace[i] = towupper(workspace[i]);
+
+	result = wcstotext(workspace, i);
+
+	pfree(workspace);
+
+	PG_RETURN_TEXT_P(result);
+
+#else /* !USE_WIDE_UPPER_LOWER */
+
 	text	   *string = PG_GETARG_TEXT_P_COPY(0);
 	char	   *ptr;
 	int			m;
@@ -93,6 +253,7 @@ upper(PG_FUNCTION_ARGS)
 	}
 
 	PG_RETURN_TEXT_P(string);
+#endif /* USE_WIDE_UPPER_LOWER */
 }
 
 
@@ -116,6 +277,32 @@ upper(PG_FUNCTION_ARGS)
 Datum
 initcap(PG_FUNCTION_ARGS)
 {
+#ifdef USE_WIDE_UPPER_LOWER
+	text	   *string = PG_GETARG_TEXT_P(0);
+	text	   *result;
+	wchar_t	   *workspace;
+	int			wasalnum = 0;
+	int			i;
+
+	workspace = texttowcs(string);
+
+	for (i = 0; workspace[i] != 0; i++)
+	{
+		if (wasalnum)
+			workspace[i] = towlower(workspace[i]);
+		else
+			workspace[i] = towupper(workspace[i]);
+		wasalnum = iswalnum(workspace[i]);
+	}
+
+	result = wcstotext(workspace, i);
+
+	pfree(workspace);
+
+	PG_RETURN_TEXT_P(result);
+
+#else /* !USE_WIDE_UPPER_LOWER */
+
 	text	   *string = PG_GETARG_TEXT_P_COPY(0);
 	char	   *ptr;
 	int			m;
@@ -142,6 +329,7 @@ initcap(PG_FUNCTION_ARGS)
 	}
 
 	PG_RETURN_TEXT_P(string);
+#endif /* USE_WIDE_UPPER_LOWER */
 }
author	Tom Lane <tgl@sss.pgh.pa.us>	2004-05-22 00:34:51 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2004-05-22 00:34:51 +0000
commit	398386943936fbf66291ae3c6ed307779e4b90e4 (patch)
tree	af0523da83c2fe61a30db6de515a6e544051eed3 /src/backend/utils/adt/oracle_compat.c
parent	add8b70dda0439c8b3a89440576cbfc19989ed39 (diff)
download	postgresql-398386943936fbf66291ae3c6ed307779e4b90e4.tar.gz postgresql-398386943936fbf66291ae3c6ed307779e4b90e4.zip