1 files changed, 137 insertions, 7 deletions
diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c
index 9fcd5ae747a..d62315d0f61 100644
--- a/src/backend/utils/adt/oracle_compat.c
+++ b/src/backend/utils/adt/oracle_compat.c
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	$PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.70 2007/02/27 23:48:08 tgl Exp $
+ *	$PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.71 2007/09/18 17:41:17 adunstan Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1246,6 +1246,13 @@ translate(PG_FUNCTION_ARGS)
  *
  *	 Returns the decimal representation of the first character from
  *	 string.
+ *   If the string is empty we return 0.
+ *   If the database encoding is UTF8, we return the Unicode codepoint. 
+ *   If the database encoding is any other multi-byte encoding, we
+ *   return the value of the first byte if it is an ASCII character
+ *   (range 1 .. 127), or raise an error.
+ *   For all other encodings we return the value of the first byte,
+ *   (range 1..255).
  *
  ********************************************************************/
 
@@ -1253,11 +1260,57 @@ Datum
 ascii(PG_FUNCTION_ARGS)
 {
 	text	   *string = PG_GETARG_TEXT_P(0);
+	int encoding = GetDatabaseEncoding();
+	unsigned char *data;
 
 	if (VARSIZE(string) <= VARHDRSZ)
 		PG_RETURN_INT32(0);
 
-	PG_RETURN_INT32((int32) *((unsigned char *) VARDATA(string)));
+	data = (unsigned char *) VARDATA(string);
+
+	if (encoding == PG_UTF8 && *data > 127)
+	{
+		/* return the code point for Unicode */
+
+		int result = 0, tbytes = 0, i;
+
+		if (*data >= 0xF0)
+		{
+			result = *data & 0x07;
+			tbytes = 3;
+		}
+		else if (*data >= 0xE0)
+		{
+			result = *data & 0x0F;
+			tbytes = 2;
+		}
+		else
+		{
+			Assert (*data > 0xC0);
+			result = *data & 0x1f;
+			tbytes = 1;
+		}
+
+		Assert (tbytes > 0);
+
+		for (i = 1; i <= tbytes; i++)
+		{
+			Assert ((data[i] & 0xC0) == 0x80);
+			result = (result << 6) + (data[i] & 0x3f);
+		}
+
+		PG_RETURN_INT32(result);
+	}
+	else
+	{
+		if (pg_encoding_max_length(encoding) > 1 && *data > 127)
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("requested character too large")));
+
+
+		PG_RETURN_INT32((int32) *data);
+	}
 }
 
 /********************************************************************
@@ -1270,19 +1323,96 @@ ascii(PG_FUNCTION_ARGS)
  *
  * Purpose:
  *
- *	Returns the character having the binary equivalent to val
+ *	Returns the character having the binary equivalent to val.
+ *
+ * For UTF8 we treat the argumwent as a Unicode code point.
+ * For other multi-byte encodings we raise an error for arguments
+ * outside the strict ASCII range (1..127).
+ *
+ * It's important that we don't ever return a value that is not valid
+ * in the database encoding, so that this doesn't become a way for
+ * invalid data to enter the database.
  *
  ********************************************************************/
 
 Datum
 chr(PG_FUNCTION_ARGS)
 {
-	int32		cvalue = PG_GETARG_INT32(0);
+	uint32		cvalue = PG_GETARG_UINT32(0);
 	text	   *result;
+	int encoding = GetDatabaseEncoding();
+
+	if (encoding == PG_UTF8 && cvalue > 127)
+	{
+		/* for Unicode we treat the argument as a code point */
+		int bytes ;
+		char *wch;
 
-	result = (text *) palloc(VARHDRSZ + 1);
-	SET_VARSIZE(result, VARHDRSZ + 1);
-	*VARDATA(result) = (char) cvalue;
+		/* We only allow valid Unicode code points */
+		if (cvalue > 0x001fffff)
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("requested character too large for encoding: %d", 
+							cvalue)));
+
+		if (cvalue > 0xffff)
+			bytes = 4;
+		else if (cvalue > 0x07ff)
+			bytes = 3;
+		else
+			bytes = 2;
+
+		result = (text *) palloc(VARHDRSZ + bytes);
+		SET_VARSIZE(result, VARHDRSZ + bytes);
+		wch = VARDATA(result);
+
+		if (bytes == 2)
+		{
+			wch[0] = 0xC0 | ((cvalue >> 6) & 0x1F);
+			wch[1] = 0x80 | (cvalue & 0x3F);;
+		}
+		else if (bytes == 3)
+		{
+			wch[0] = 0xE0 | ((cvalue >> 12) & 0x0F);
+			wch[1] = 0x80 | ((cvalue >> 6) & 0x3F);
+			wch[2] = 0x80 | (cvalue & 0x3F);
+		}
+		else
+		{
+			wch[0] = 0xF0 | ((cvalue >> 18) & 0x07);
+			wch[1] = 0x80 | ((cvalue >> 12) & 0x3F);
+			wch[2] = 0x80 | ((cvalue >> 6) & 0x3F);
+			wch[3] = 0x80 | (cvalue & 0x3F);
+		}
+		
+	}
+
+	else
+	{
+		bool is_mb;
+
+		/* Error out on arguments that make no sense or that we
+		 * can't validly represent in the encoding.
+		 */
+
+		if (cvalue == 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("null character not permitted")));
+
+		is_mb = pg_encoding_max_length(encoding) > 1;
+
+		if ((is_mb && (cvalue > 255)) || (! is_mb && (cvalue > 127)))
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("requested character too large for encoding: %d",
+							cvalue)));
+		
+
+		result = (text *) palloc(VARHDRSZ + 1);
+		SET_VARSIZE(result, VARHDRSZ + 1);
+		*VARDATA(result) = (char) cvalue;
+	}
 
 	PG_RETURN_TEXT_P(result);
 }