diff options
author | Andrew Dunstan <andrew@dunslane.net> | 2007-09-18 17:41:17 +0000 |
---|---|---|
committer | Andrew Dunstan <andrew@dunslane.net> | 2007-09-18 17:41:17 +0000 |
commit | 55613bf9cd7d6071e43e68ac14bc0243a1027507 (patch) | |
tree | 9b151f94d94e7dc3aa5988c03867d3f6f6b562ba /src/backend/utils/adt/oracle_compat.c | |
parent | 8544110042ddf8be29e177e37f53516686a06da2 (diff) | |
download | postgresql-55613bf9cd7d6071e43e68ac14bc0243a1027507.tar.gz postgresql-55613bf9cd7d6071e43e68ac14bc0243a1027507.zip |
Close previously open holes for invalidly encoded data to enter the
database via builtin functions, as recently discussed on -hackers.
chr() now returns a character in the database encoding. For UTF8 encoded databases
the argument is treated as a Unicode code point. For other multi-byte encodings
the argument must designate a strict ASCII character, or an error is raised,
as is also the case if the argument is 0.
ascii() is adjusted so that it remains the inverse of chr().
The two argument form of convert() is gone, and the three argument form now
takes a bytea first argument and returns a bytea. To cover this loss, three new
functions are introduced:
. convert_from(bytea, name) returns text - converts the first argument from the
named encoding to the database encoding
. convert_to(text, name) returns bytea - converts the first argument from the
database encoding to the named encoding
. length(bytea, name) returns int - gives the length of the first argument in
characters in the named encoding
Diffstat (limited to 'src/backend/utils/adt/oracle_compat.c')
-rw-r--r-- | src/backend/utils/adt/oracle_compat.c | 144 |
1 files changed, 137 insertions, 7 deletions
diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c index 9fcd5ae747a..d62315d0f61 100644 --- a/src/backend/utils/adt/oracle_compat.c +++ b/src/backend/utils/adt/oracle_compat.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.70 2007/02/27 23:48:08 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.71 2007/09/18 17:41:17 adunstan Exp $ * *------------------------------------------------------------------------- */ @@ -1246,6 +1246,13 @@ translate(PG_FUNCTION_ARGS) * * Returns the decimal representation of the first character from * string. + * If the string is empty we return 0. + * If the database encoding is UTF8, we return the Unicode codepoint. + * If the database encoding is any other multi-byte encoding, we + * return the value of the first byte if it is an ASCII character + * (range 1 .. 127), or raise an error. + * For all other encodings we return the value of the first byte, + * (range 1..255). 
* ********************************************************************/ @@ -1253,11 +1260,57 @@ Datum ascii(PG_FUNCTION_ARGS) { text *string = PG_GETARG_TEXT_P(0); + int encoding = GetDatabaseEncoding(); + unsigned char *data; if (VARSIZE(string) <= VARHDRSZ) PG_RETURN_INT32(0); - PG_RETURN_INT32((int32) *((unsigned char *) VARDATA(string))); + data = (unsigned char *) VARDATA(string); + + if (encoding == PG_UTF8 && *data > 127) + { + /* return the code point for Unicode */ + + int result = 0, tbytes = 0, i; + + if (*data >= 0xF0) + { + result = *data & 0x07; + tbytes = 3; + } + else if (*data >= 0xE0) + { + result = *data & 0x0F; + tbytes = 2; + } + else + { + Assert (*data > 0xC0); + result = *data & 0x1f; + tbytes = 1; + } + + Assert (tbytes > 0); + + for (i = 1; i <= tbytes; i++) + { + Assert ((data[i] & 0xC0) == 0x80); + result = (result << 6) + (data[i] & 0x3f); + } + + PG_RETURN_INT32(result); + } + else + { + if (pg_encoding_max_length(encoding) > 1 && *data > 127) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested character too large"))); + + + PG_RETURN_INT32((int32) *data); + } } /******************************************************************** @@ -1270,19 +1323,96 @@ ascii(PG_FUNCTION_ARGS) * * Purpose: * - * Returns the character having the binary equivalent to val + * Returns the character having the binary equivalent to val. + * + * For UTF8 we treat the argumwent as a Unicode code point. + * For other multi-byte encodings we raise an error for arguments + * outside the strict ASCII range (1..127). + * + * It's important that we don't ever return a value that is not valid + * in the database encoding, so that this doesn't become a way for + * invalid data to enter the database. 
* ********************************************************************/ Datum chr(PG_FUNCTION_ARGS) { - int32 cvalue = PG_GETARG_INT32(0); + uint32 cvalue = PG_GETARG_UINT32(0); text *result; + int encoding = GetDatabaseEncoding(); + + if (encoding == PG_UTF8 && cvalue > 127) + { + /* for Unicode we treat the argument as a code point */ + int bytes ; + char *wch; - result = (text *) palloc(VARHDRSZ + 1); - SET_VARSIZE(result, VARHDRSZ + 1); - *VARDATA(result) = (char) cvalue; + /* We only allow valid Unicode code points */ + if (cvalue > 0x001fffff) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested character too large for encoding: %d", + cvalue))); + + if (cvalue > 0xffff) + bytes = 4; + else if (cvalue > 0x07ff) + bytes = 3; + else + bytes = 2; + + result = (text *) palloc(VARHDRSZ + bytes); + SET_VARSIZE(result, VARHDRSZ + bytes); + wch = VARDATA(result); + + if (bytes == 2) + { + wch[0] = 0xC0 | ((cvalue >> 6) & 0x1F); + wch[1] = 0x80 | (cvalue & 0x3F);; + } + else if (bytes == 3) + { + wch[0] = 0xE0 | ((cvalue >> 12) & 0x0F); + wch[1] = 0x80 | ((cvalue >> 6) & 0x3F); + wch[2] = 0x80 | (cvalue & 0x3F); + } + else + { + wch[0] = 0xF0 | ((cvalue >> 18) & 0x07); + wch[1] = 0x80 | ((cvalue >> 12) & 0x3F); + wch[2] = 0x80 | ((cvalue >> 6) & 0x3F); + wch[3] = 0x80 | (cvalue & 0x3F); + } + + } + + else + { + bool is_mb; + + /* Error out on arguments that make no sense or that we + * can't validly represent in the encoding. + */ + + if (cvalue == 0) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("null character not permitted"))); + + is_mb = pg_encoding_max_length(encoding) > 1; + + if ((is_mb && (cvalue > 255)) || (! 
is_mb && (cvalue > 127))) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested character too large for encoding: %d", + cvalue))); + + + result = (text *) palloc(VARHDRSZ + 1); + SET_VARSIZE(result, VARHDRSZ + 1); + *VARDATA(result) = (char) cvalue; + } PG_RETURN_TEXT_P(result); } |