diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/catalog/pg_conversion.c | 9 | ||||
-rw-r--r-- | src/backend/utils/adt/oracle_compat.c | 144 | ||||
-rw-r--r-- | src/backend/utils/mb/mbutils.c | 74 | ||||
-rw-r--r-- | src/backend/utils/mb/wchar.c | 42 | ||||
-rw-r--r-- | src/include/catalog/catversion.h | 4 | ||||
-rw-r--r-- | src/include/catalog/pg_proc.h | 14 | ||||
-rw-r--r-- | src/include/mb/pg_wchar.h | 4 | ||||
-rw-r--r-- | src/include/utils/builtins.h | 6 |
8 files changed, 250 insertions, 47 deletions
diff --git a/src/backend/catalog/pg_conversion.c b/src/backend/catalog/pg_conversion.c index 262d9f41fb7..7146d0b4f53 100644 --- a/src/backend/catalog/pg_conversion.c +++ b/src/backend/catalog/pg_conversion.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/pg_conversion.c,v 1.36 2007/02/27 23:48:07 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/pg_conversion.c,v 1.37 2007/09/18 17:41:17 adunstan Exp $ * *------------------------------------------------------------------------- */ @@ -282,7 +282,10 @@ FindConversion(const char *conname, Oid connamespace) * CONVERT <left paren> <character value expression> * USING <form-of-use conversion name> <right paren> * - * TEXT convert_using(TEXT string, TEXT conversion_name) + * BYTEA convert_using(TEXT string, TEXT conversion_name) + * + * bytea is returned so we don't give a value that is + * not valid in the database encoding. */ Datum pg_convert_using(PG_FUNCTION_ARGS) @@ -344,5 +347,5 @@ pg_convert_using(PG_FUNCTION_ARGS) pfree(result); pfree(str); - PG_RETURN_TEXT_P(retval); + PG_RETURN_BYTEA_P(retval); } diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c index 9fcd5ae747a..d62315d0f61 100644 --- a/src/backend/utils/adt/oracle_compat.c +++ b/src/backend/utils/adt/oracle_compat.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.70 2007/02/27 23:48:08 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.71 2007/09/18 17:41:17 adunstan Exp $ * *------------------------------------------------------------------------- */ @@ -1246,6 +1246,13 @@ translate(PG_FUNCTION_ARGS) * * Returns the decimal representation of the first character from * string. + * If the string is empty we return 0. + * If the database encoding is UTF8, we return the Unicode codepoint. 
+ * If the database encoding is any other multi-byte encoding, we + * return the value of the first byte if it is an ASCII character + * (range 1 .. 127), or raise an error. + * For all other encodings we return the value of the first byte, + * (range 1..255). * ********************************************************************/ @@ -1253,11 +1260,57 @@ Datum ascii(PG_FUNCTION_ARGS) { text *string = PG_GETARG_TEXT_P(0); + int encoding = GetDatabaseEncoding(); + unsigned char *data; if (VARSIZE(string) <= VARHDRSZ) PG_RETURN_INT32(0); - PG_RETURN_INT32((int32) *((unsigned char *) VARDATA(string))); + data = (unsigned char *) VARDATA(string); + + if (encoding == PG_UTF8 && *data > 127) + { + /* return the code point for Unicode */ + + int result = 0, tbytes = 0, i; + + if (*data >= 0xF0) + { + result = *data & 0x07; + tbytes = 3; + } + else if (*data >= 0xE0) + { + result = *data & 0x0F; + tbytes = 2; + } + else + { + Assert (*data > 0xC0); + result = *data & 0x1f; + tbytes = 1; + } + + Assert (tbytes > 0); + + for (i = 1; i <= tbytes; i++) + { + Assert ((data[i] & 0xC0) == 0x80); + result = (result << 6) + (data[i] & 0x3f); + } + + PG_RETURN_INT32(result); + } + else + { + if (pg_encoding_max_length(encoding) > 1 && *data > 127) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested character too large"))); + + + PG_RETURN_INT32((int32) *data); + } } /******************************************************************** @@ -1270,19 +1323,96 @@ ascii(PG_FUNCTION_ARGS) * * Purpose: * - * Returns the character having the binary equivalent to val + * Returns the character having the binary equivalent to val. + * + * For UTF8 we treat the argument as a Unicode code point. + * For other multi-byte encodings we raise an error for arguments + * outside the strict ASCII range (1..127). 
+ * + * It's important that we don't ever return a value that is not valid + * in the database encoding, so that this doesn't become a way for + * invalid data to enter the database. * ********************************************************************/ Datum chr(PG_FUNCTION_ARGS) { - int32 cvalue = PG_GETARG_INT32(0); + uint32 cvalue = PG_GETARG_UINT32(0); text *result; + int encoding = GetDatabaseEncoding(); + + if (encoding == PG_UTF8 && cvalue > 127) + { + /* for Unicode we treat the argument as a code point */ + int bytes ; + char *wch; - result = (text *) palloc(VARHDRSZ + 1); - SET_VARSIZE(result, VARHDRSZ + 1); - *VARDATA(result) = (char) cvalue; + /* We only allow valid Unicode code points */ + if (cvalue > 0x001fffff) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested character too large for encoding: %d", + cvalue))); + + if (cvalue > 0xffff) + bytes = 4; + else if (cvalue > 0x07ff) + bytes = 3; + else + bytes = 2; + + result = (text *) palloc(VARHDRSZ + bytes); + SET_VARSIZE(result, VARHDRSZ + bytes); + wch = VARDATA(result); + + if (bytes == 2) + { + wch[0] = 0xC0 | ((cvalue >> 6) & 0x1F); + wch[1] = 0x80 | (cvalue & 0x3F);; + } + else if (bytes == 3) + { + wch[0] = 0xE0 | ((cvalue >> 12) & 0x0F); + wch[1] = 0x80 | ((cvalue >> 6) & 0x3F); + wch[2] = 0x80 | (cvalue & 0x3F); + } + else + { + wch[0] = 0xF0 | ((cvalue >> 18) & 0x07); + wch[1] = 0x80 | ((cvalue >> 12) & 0x3F); + wch[2] = 0x80 | ((cvalue >> 6) & 0x3F); + wch[3] = 0x80 | (cvalue & 0x3F); + } + + } + + else + { + bool is_mb; + + /* Error out on arguments that make no sense or that we + * can't validly represent in the encoding. + */ + + if (cvalue == 0) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("null character not permitted"))); + + is_mb = pg_encoding_max_length(encoding) > 1; + + if ((is_mb && (cvalue > 255)) || (! 
is_mb && (cvalue > 127))) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested character too large for encoding: %d", + cvalue))); + + + result = (text *) palloc(VARHDRSZ + 1); + SET_VARSIZE(result, VARHDRSZ + 1); + *VARDATA(result) = (char) cvalue; + } PG_RETURN_TEXT_P(result); } diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index a466073ca0a..e3ffd370e81 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -4,7 +4,7 @@ * (currently mule internal code (mic) is used) * Tatsuo Ishii * - * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.63 2007/05/28 16:43:24 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.64 2007/09/18 17:41:17 adunstan Exp $ */ #include "postgres.h" @@ -292,12 +292,12 @@ pg_do_encoding_conversion(unsigned char *src, int len, } /* - * Convert string using encoding_nanme. We assume that string's - * encoding is same as DB encoding. + * Convert string using encoding_name. The source + * encoding is the DB encoding. * - * TEXT convert(TEXT string, NAME encoding_name) */ + * BYTEA convert_to(TEXT string, NAME encoding_name) */ Datum -pg_convert(PG_FUNCTION_ARGS) +pg_convert_to(PG_FUNCTION_ARGS) { Datum string = PG_GETARG_DATUM(0); Datum dest_encoding_name = PG_GETARG_DATUM(1); @@ -306,7 +306,30 @@ pg_convert(PG_FUNCTION_ARGS) Datum result; result = DirectFunctionCall3( - pg_convert2, string, src_encoding_name, dest_encoding_name); + pg_convert, string, src_encoding_name, dest_encoding_name); + + /* free memory allocated by namein */ + pfree((void *) src_encoding_name); + + PG_RETURN_BYTEA_P(result); +} + +/* + * Convert string using encoding_name. The destination + * encoding is the DB encoding. 
+ * + * TEXT convert_from(BYTEA string, NAME encoding_name) */ +Datum +pg_convert_from(PG_FUNCTION_ARGS) +{ + Datum string = PG_GETARG_DATUM(0); + Datum src_encoding_name = PG_GETARG_DATUM(1); + Datum dest_encoding_name = DirectFunctionCall1( + namein, CStringGetDatum(DatabaseEncoding->name)); + Datum result; + + result = DirectFunctionCall3( + pg_convert, string, src_encoding_name, dest_encoding_name); /* free memory allocated by namein */ pfree((void *) src_encoding_name); @@ -315,20 +338,20 @@ pg_convert(PG_FUNCTION_ARGS) } /* - * Convert string using encoding_name. + * Convert string using encoding_names. * - * TEXT convert2(TEXT string, NAME src_encoding_name, NAME dest_encoding_name) + * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name) */ Datum -pg_convert2(PG_FUNCTION_ARGS) +pg_convert(PG_FUNCTION_ARGS) { - text *string = PG_GETARG_TEXT_P(0); + bytea *string = PG_GETARG_TEXT_P(0); char *src_encoding_name = NameStr(*PG_GETARG_NAME(1)); int src_encoding = pg_char_to_encoding(src_encoding_name); char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2)); int dest_encoding = pg_char_to_encoding(dest_encoding_name); unsigned char *result; - text *retval; + bytea *retval; unsigned char *str; int len; @@ -343,8 +366,9 @@ pg_convert2(PG_FUNCTION_ARGS) errmsg("invalid destination encoding name \"%s\"", dest_encoding_name))); - /* make sure that source string is null terminated */ + /* make sure that source string is valid and null terminated */ len = VARSIZE(string) - VARHDRSZ; + pg_verify_mbstr(src_encoding,VARDATA(string),len,false); str = palloc(len + 1); memcpy(str, VARDATA(string), len); *(str + len) = '\0'; @@ -354,8 +378,7 @@ pg_convert2(PG_FUNCTION_ARGS) elog(ERROR, "encoding conversion failed"); /* - * build text data type structure. we cannot use textin() here, since - * textin assumes that input string encoding is same as database encoding. + * build bytea data type structure. 
*/ len = strlen((char *) result) + VARHDRSZ; retval = palloc(len); @@ -369,7 +392,28 @@ pg_convert2(PG_FUNCTION_ARGS) /* free memory if allocated by the toaster */ PG_FREE_IF_COPY(string, 0); - PG_RETURN_TEXT_P(retval); + PG_RETURN_BYTEA_P(retval); +} + +/* + * get the length of the string considered as text in the specified + * encoding. Raises an error if the data is not valid in that + * encoding. + * + * INT4 length (BYTEA string, NAME src_encoding_name) + */ +Datum +length_in_encoding(PG_FUNCTION_ARGS) +{ + bytea *string = PG_GETARG_BYTEA_P(0); + char *src_encoding_name = NameStr(*PG_GETARG_NAME(1)); + int src_encoding = pg_char_to_encoding(src_encoding_name); + int len = VARSIZE(string) - VARHDRSZ; + int retval; + + retval = pg_verify_mbstr_len(src_encoding, VARDATA(string), len, false); + PG_RETURN_INT32(retval); + } /* diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index cc8d4b58624..2c98f4b476e 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1,9 +1,7 @@ /* * conversion functions between pg_wchar and multibyte streams. * Tatsuo Ishii - * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.63 2007/07/12 21:17:09 tgl Exp $ - * - * WIN1250 client encoding updated by Pavel Behal + * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.64 2007/09/18 17:41:17 adunstan Exp $ * */ /* can be used in either frontend or backend */ @@ -1435,23 +1433,37 @@ pg_database_encoding_max_length(void) bool pg_verifymbstr(const char *mbstr, int len, bool noError) { - return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError); + return + pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0; } /* * Verify mbstr to make sure that it is validly encoded in the specified * encoding. 
* + */ +bool +pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError) +{ + return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0; +} + +/* + * Verify mbstr to make sure that it is validly encoded in the specified + * encoding. + * * mbstr is not necessarily zero terminated; length of mbstr is * specified by len. * - * If OK, return TRUE. If a problem is found, return FALSE when noError is + * If OK, return length of string in the encoding. + * If a problem is found, return -1 when noError is * true; when noError is false, ereport() a descriptive message. - */ -bool -pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError) + */ +int +pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError) { mbverifier mbverify; + int mb_len; Assert(PG_VALID_ENCODING(encoding)); @@ -1463,14 +1475,16 @@ pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError) const char *nullpos = memchr(mbstr, 0, len); if (nullpos == NULL) - return true; + return len; if (noError) - return false; + return -1; report_invalid_encoding(encoding, nullpos, 1); } /* fetch function pointer just once */ mbverify = pg_wchar_table[encoding].mbverify; + + mb_len = 0; while (len > 0) { @@ -1481,12 +1495,13 @@ pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError) { if (*mbstr != '\0') { + mb_len++; mbstr++; len--; continue; } if (noError) - return false; + return -1; report_invalid_encoding(encoding, mbstr, len); } @@ -1495,14 +1510,15 @@ pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError) if (l < 0) { if (noError) - return false; + return -1; report_invalid_encoding(encoding, mbstr, len); } mbstr += l; len -= l; + mb_len++; } - return true; + return mb_len; } /* diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index d2301179b6a..e21606a1259 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -37,7 +37,7 @@ * Portions Copyright (c) 
1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.424 2007/09/11 03:28:05 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.425 2007/09/18 17:41:17 adunstan Exp $ * *------------------------------------------------------------------------- */ @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 200709101 +#define CATALOG_VERSION_NO 200709181 #endif diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 8ee98cb4df6..5f534839b23 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.469 2007/09/11 03:28:05 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.470 2007/09/18 17:41:17 adunstan Exp $ * * NOTES * The script catalog/genbki.sh reads this file and generates .bki @@ -2232,13 +2232,19 @@ DESCR("encoding name of current database"); DATA(insert OID = 810 ( pg_client_encoding PGNSP PGUID 12 1 0 f f t f s 0 19 "" _null_ _null_ _null_ pg_client_encoding - _null_ _null_ )); DESCR("encoding name of current database"); -DATA(insert OID = 1717 ( convert PGNSP PGUID 12 1 0 f f t f s 2 25 "25 19" _null_ _null_ _null_ pg_convert - _null_ _null_ )); +DATA(insert OID = 1713 ( length PGNSP PGUID 12 1 0 f f t f s 2 23 "17 19" _null_ _null_ _null_ length_in_encoding - _null_ _null_ )); +DESCR("length of string in specified encoding"); + +DATA(insert OID = 1714 ( convert_from PGNSP PGUID 12 1 0 f f t f s 2 25 "17 19" _null_ _null_ _null_ pg_convert_from - _null_ _null_ )); +DESCR("convert string with specified source encoding name"); + +DATA(insert OID = 1717 ( convert_to PGNSP PGUID 12 1 0 f f t f s 2 17 "25 19" _null_ _null_ _null_ 
pg_convert_to - _null_ _null_ )); DESCR("convert string with specified destination encoding name"); -DATA(insert OID = 1813 ( convert PGNSP PGUID 12 1 0 f f t f s 3 25 "25 19 19" _null_ _null_ _null_ pg_convert2 - _null_ _null_ )); +DATA(insert OID = 1813 ( convert PGNSP PGUID 12 1 0 f f t f s 3 17 "17 19 19" _null_ _null_ _null_ pg_convert - _null_ _null_ )); DESCR("convert string with specified encoding names"); -DATA(insert OID = 1619 ( convert_using PGNSP PGUID 12 1 0 f f t f s 2 25 "25 25" _null_ _null_ _null_ pg_convert_using - _null_ _null_ )); +DATA(insert OID = 1619 ( convert_using PGNSP PGUID 12 1 0 f f t f s 2 17 "25 25" _null_ _null_ _null_ pg_convert_using - _null_ _null_ )); DESCR("convert string with specified conversion name"); DATA(insert OID = 1264 ( pg_char_to_encoding PGNSP PGUID 12 1 0 f f t f s 1 23 "19" _null_ _null_ _null_ PG_char_to_encoding - _null_ _null_ )); diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 2cb0656cb98..19b5fe1e3a7 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -1,4 +1,4 @@ -/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.72 2007/04/15 10:56:30 ishii Exp $ */ +/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.73 2007/09/18 17:41:17 adunstan Exp $ */ #ifndef PG_WCHAR_H #define PG_WCHAR_H @@ -372,6 +372,8 @@ extern void UtfToLocal(const unsigned char *utf, unsigned char *iso, extern bool pg_verifymbstr(const char *mbstr, int len, bool noError); extern bool pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError); +extern int pg_verify_mbstr_len(int encoding, const char *mbstr, int len, + bool noError); extern void report_invalid_encoding(int encoding, const char *mbstr, int len); extern void report_untranslatable_char(int src_encoding, int dest_encoding, diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 5d581b6ea5c..c1610f320f5 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -7,7 +7,7 @@ * 
Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.302 2007/09/04 16:41:43 adunstan Exp $ + * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.303 2007/09/18 17:41:17 adunstan Exp $ * *------------------------------------------------------------------------- */ @@ -902,7 +902,9 @@ extern Datum PG_char_to_encoding(PG_FUNCTION_ARGS); extern Datum PG_character_set_name(PG_FUNCTION_ARGS); extern Datum PG_character_set_id(PG_FUNCTION_ARGS); extern Datum pg_convert(PG_FUNCTION_ARGS); -extern Datum pg_convert2(PG_FUNCTION_ARGS); +extern Datum pg_convert_to(PG_FUNCTION_ARGS); +extern Datum pg_convert_from(PG_FUNCTION_ARGS); +extern Datum length_in_encoding(PG_FUNCTION_ARGS); /* format_type.c */ extern Datum format_type(PG_FUNCTION_ARGS); |