/*
 * This file contains public functions for conversion between
 * client encoding and server internal encoding.
 * (currently mule internal code (mic) is used)
 * Tatsuo Ishii
 * $Id: mbutils.c,v 1.27 2001/11/20 01:32:29 ishii Exp $
 */
#include "postgres.h"

#include "miscadmin.h"
#include "mb/pg_wchar.h"
#include "utils/builtins.h"

/*
 * For the current frontend (client) and backend (database) encodings we
 * keep a pointer to the matching pg_enc2name_tbl entry, which carries both
 * the encoding identifier and the encoding name. This avoids looking up
 * the name from the identifier in getdatabaseencoding() and other routines.
 *
 * Default is PG_SQL_ASCII encoding (but this is never used, because
 * the backend initializes it during startup via SetDatabaseEncoding()).
 *
 * Karel Zak (Aug 2001)
 */
static pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
static pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];

/*
 * Converters selected by pg_set_client_encoding(). The first pair is used
 * by pg_client_to_server(), the second pair by pg_server_to_client().
 * A zero entry means "no conversion step needed".
 */
static to_mic_converter client_to_mic;		/* client encoding to MIC */
static from_mic_converter server_from_mic;	/* MIC to server encoding */
static to_mic_converter server_to_mic;		/* server encoding to MIC */
static from_mic_converter client_from_mic;	/* MIC to client encoding */

/*
 * Find encoding table entry by encoding.
 *
 * Returns a pointer to the pg_enconv_tbl entry for the given encoding id,
 * or NULL if the id does not satisfy PG_VALID_ENCODING().
 */
pg_enconv *
pg_get_enconv_by_encoding(int encoding)
{
	if (PG_VALID_ENCODING(encoding))
	{
		Assert((&pg_enconv_tbl[encoding])->encoding == encoding);
		return &pg_enconv_tbl[encoding];
	}
	return NULL;
}

/*
 * Find appropriate encoding conversion functions. If no such
 * functions are found, returns -1; otherwise returns 0.
 *
 * Arguments:
 *
 * src, dest (in): source and destination encoding ids
 *
 * src_to_mic (out): pointer to a function which converts src to
 * mic/unicode according to dest. If src == mic/unicode or no
 * appropriate function is found, set to 0.
 *
 * dest_from_mic (out): pointer to a function which converts
 * mic/unicode to dest according to src. If dest == mic/unicode or no
 * appropriate function is found, set to 0.
 *
 * An encoding id for which pg_get_enconv_by_encoding() has no entry is
 * treated the same as an entry without a converter (-1 is returned)
 * instead of dereferencing a null pointer.
 */
int
pg_find_encoding_converters(int src, int dest,
							to_mic_converter *src_to_mic,
							from_mic_converter *dest_from_mic)
{
	pg_enconv  *src_conv;
	pg_enconv  *dest_conv;

	if (src == dest)
	{
		/* src == dest: nothing to convert */
		*src_to_mic = NULL;
		*dest_from_mic = NULL;
	}
	else if (src == PG_MULE_INTERNAL)
	{
		/* src is MULE_INTERNAL: only the "from MIC" step is needed */
		dest_conv = pg_get_enconv_by_encoding(dest);
		*dest_from_mic = dest_conv ? dest_conv->from_mic : NULL;
		if (*dest_from_mic == NULL)
			return -1;
		*src_to_mic = NULL;
	}
	else if (dest == PG_MULE_INTERNAL)
	{
		/* dest is MULE_INTERNAL: only the "to MIC" step is needed */
		src_conv = pg_get_enconv_by_encoding(src);
		*src_to_mic = src_conv ? src_conv->to_mic : NULL;
		if (*src_to_mic == NULL)
			return -1;
		*dest_from_mic = NULL;
	}
	else if (src == PG_UTF8)
	{
		/* src is UNICODE: use dest's direct "from unicode" converter */
		dest_conv = pg_get_enconv_by_encoding(dest);
		*dest_from_mic = dest_conv ? dest_conv->from_unicode : NULL;
		if (*dest_from_mic == NULL)
			return -1;
		*src_to_mic = NULL;
	}
	else if (dest == PG_UTF8)
	{
		/* dest is UNICODE: use src's direct "to unicode" converter */
		src_conv = pg_get_enconv_by_encoding(src);
		*src_to_mic = src_conv ? src_conv->to_unicode : NULL;
		if (*src_to_mic == NULL)
			return -1;
		*dest_from_mic = NULL;
	}
	else
	{
		/* general case: go through MULE_INTERNAL, both steps required */
		src_conv = pg_get_enconv_by_encoding(src);
		dest_conv = pg_get_enconv_by_encoding(dest);
		*src_to_mic = src_conv ? src_conv->to_mic : NULL;
		*dest_from_mic = dest_conv ? dest_conv->from_mic : NULL;
		if (*src_to_mic == NULL || *dest_from_mic == NULL)
			return -1;
	}

	return 0;
}

/*
 * Set the client encoding. If encoding conversion between
 * client/server encoding is not supported, returns -1; returns 0 on
 * success.
 *
 * Both conversion directions are resolved into local variables first and
 * the file-level state (ClientEncoding and the four converters) is only
 * updated once both lookups have succeeded, so a failed call leaves the
 * previously active client encoding fully intact.
 */
int
pg_set_client_encoding(int encoding)
{
	int			current_server_encoding = DatabaseEncoding->encoding;
	to_mic_converter new_client_to_mic;
	from_mic_converter new_server_from_mic;
	to_mic_converter new_server_to_mic;
	from_mic_converter new_client_from_mic;

	if (!PG_VALID_FE_ENCODING(encoding))
		return -1;

	/* client -> server direction */
	if (pg_find_encoding_converters(encoding, current_server_encoding,
									&new_client_to_mic,
									&new_server_from_mic) < 0)
		return -1;

	/* server -> client direction */
	if (pg_find_encoding_converters(current_server_encoding, encoding,
									&new_server_to_mic,
									&new_client_from_mic) < 0)
		return -1;

	/* everything is available: commit the new state in one go */
	client_to_mic = new_client_to_mic;
	server_from_mic = new_server_from_mic;
	server_to_mic = new_server_to_mic;
	client_from_mic = new_client_from_mic;

	ClientEncoding = &pg_enc2name_tbl[encoding];
	Assert(ClientEncoding->encoding == encoding);

	return 0;
}

/*
 * returns the current client encoding
 */
int
pg_get_client_encoding(void)
{
	Assert(ClientEncoding);
	return ClientEncoding->encoding;
}

/*
 * returns the current client encoding name
 */
const char *
pg_get_client_encoding_name(void)
{
	Assert(ClientEncoding);
	return ClientEncoding->name;
}

/*
 * Convert src encoding and returns it. Actual conversion is done by
 * src_to_mic and dest_from_mic, which can be obtained by
 * pg_find_encoding_converters().
The reason we require two conversion * functions is that we have an intermediate encoding: MULE_INTERNAL * Using intermediate encodings will reduce the number of functions * doing encoding conversions. Special case is either src or dest is * the intermediate encoding itself. In this case, you don't need src * or dest (setting 0 will indicate there's no conversion * function). Another case is you have direct-conversion function from * src to dest. In this case either src_to_mic or dest_from_mic could * be set to 0 also. * * Note that If src or dest is UNICODE, we have to do * direct-conversion, since we don't support conversion bwteen UNICODE * and MULE_INTERNAL, we cannot go through MULE_INTERNAL. * * CASE 1: if no conversion is required, then the given pointer s is returned. * * CASE 2: if conversion is required, a palloc'd string is returned. * * Callers must check whether return value differs from passed value * to determine whether to pfree the result or not! * * Note: we assume that conversion cannot cause more than a 4-to-1 growth * in the length of the string --- is this enough? */ unsigned char * pg_do_encoding_conversion(unsigned char *src, int len, to_mic_converter src_to_mic, from_mic_converter dest_from_mic) { unsigned char *result = src; unsigned char *buf; if (src_to_mic) { buf = (unsigned char *) palloc(len * 4 + 1); (*src_to_mic) (result, buf, len); result = buf; len = strlen(result); } if (dest_from_mic) { buf = (unsigned char *) palloc(len * 4 + 1); (*dest_from_mic) (result, buf, len); if (result != src) pfree(result); /* release first buffer */ result = buf; } return result; } /* * Convert string using encoding_nanme. We assume that string's * encoding is same as DB encoding. 
* * TEXT convert(TEXT string, NAME encoding_name) */ Datum pg_convert(PG_FUNCTION_ARGS) { Datum string = PG_GETARG_DATUM(0); Datum dest_encoding_name = PG_GETARG_DATUM(1); Datum src_encoding_name = DirectFunctionCall1( namein, CStringGetDatum(DatabaseEncoding->name)); Datum result; result = DirectFunctionCall3( pg_convert2, string, src_encoding_name, dest_encoding_name); /* free memory allocated by namein */ pfree((void *)dest_encoding_name); PG_RETURN_TEXT_P(result); } /* * Convert string using encoding_nanme. * * TEXT convert(TEXT string, NAME src_encoding_name, NAME dest_encoding_name) */ Datum pg_convert2(PG_FUNCTION_ARGS) { text *string = PG_GETARG_TEXT_P(0); char *src_encoding_name = NameStr(*PG_GETARG_NAME(1)); int src_encoding = pg_char_to_encoding(src_encoding_name); char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2)); int dest_encoding = pg_char_to_encoding(dest_encoding_name); to_mic_converter src; from_mic_converter dest; unsigned char *result; text *retval; unsigned char *str; int len; if (src_encoding < 0) elog(ERROR, "Invalid source encoding name %s", src_encoding_name); if (dest_encoding < 0) elog(ERROR, "Invalid destination encoding name %s", dest_encoding_name); if (pg_find_encoding_converters(src_encoding, dest_encoding, &src, &dest) < 0) { elog(ERROR, "Conversion from %s to %s is not possible", src_encoding_name, dest_encoding_name); } /* make sure that source string is null terminated */ len = VARSIZE(string) - VARHDRSZ; str = palloc(len + 1); memcpy(str, VARDATA(string), len); *(str + len) = '\0'; result = pg_do_encoding_conversion(str, len, src, dest); if (result == NULL) elog(ERROR, "Encoding conversion failed"); /* build text data type structre. we cannot use textin() here, since textin assumes that input string encoding is same as database encoding. 
*/ len = strlen(result) + VARHDRSZ; retval = palloc(len); VARATT_SIZEP(retval) = len; memcpy(VARDATA(retval), result, len - VARHDRSZ); if (result != str) pfree(result); pfree(str); /* free memory if allocated by the toaster */ PG_FREE_IF_COPY(string, 0); PG_RETURN_TEXT_P(retval); } /* * convert client encoding to server encoding. * * CASE 1: if no conversion is required, then the given pointer s is returned. * * CASE 2: if conversion is required, a palloc'd string is returned. * * Callers must check whether return value differs from passed value * to determine whether to pfree the result or not! * * Note: we assume that conversion cannot cause more than a 4-to-1 growth * in the length of the string --- is this enough? */ unsigned char * pg_client_to_server(unsigned char *s, int len) { Assert(DatabaseEncoding); Assert(ClientEncoding); if (ClientEncoding->encoding == DatabaseEncoding->encoding) return s; return pg_do_encoding_conversion(s, len, client_to_mic, server_from_mic); } /* * convert server encoding to client encoding. * * CASE 1: if no conversion is required, then the given pointer s is returned. * * CASE 2: if conversion is required, a palloc'd string is returned. * * Callers must check whether return value differs from passed value * to determine whether to pfree the result or not! * * Note: we assume that conversion cannot cause more than a 4-to-1 growth * in the length of the string --- is this enough? 
*/ unsigned char * pg_server_to_client(unsigned char *s, int len) { Assert(DatabaseEncoding); Assert(ClientEncoding); if (ClientEncoding->encoding == DatabaseEncoding->encoding) return s; return pg_do_encoding_conversion(s, len, server_to_mic, client_from_mic); } /* convert a multi-byte string to a wchar */ int pg_mb2wchar(const unsigned char *from, pg_wchar *to) { return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) (from, to, strlen(from)); } /* convert a multi-byte string to a wchar with a limited length */ int pg_mb2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) { return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) (from, to, len); } /* returns the byte length of a multi-byte word */ int pg_mblen(const unsigned char *mbstr) { return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) (mbstr)); } /* returns the length (counted as a wchar) of a multi-byte string */ int pg_mbstrlen(const unsigned char *mbstr) { int len = 0; while (*mbstr) { mbstr += pg_mblen(mbstr); len++; } return (len); } /* returns the length (counted as a wchar) of a multi-byte string (not necessarily NULL terminated) */ int pg_mbstrlen_with_len(const unsigned char *mbstr, int limit) { int len = 0; int l; while (limit > 0 && *mbstr) { l = pg_mblen(mbstr); limit -= l; mbstr += l; len++; } return (len); } /* * returns the byte length of a multi-byte string * (not necessarily NULL terminated) * that is no longer than limit. * this function does not break multi-byte word boundary. */ int pg_mbcliplen(const unsigned char *mbstr, int len, int limit) { int clen = 0; int l; while (len > 0 && *mbstr) { l = pg_mblen(mbstr); if ((clen + l) > limit) break; clen += l; if (clen == limit) break; len -= l; mbstr += l; } return (clen); } /* * Similar to pg_mbcliplen but the limit parameter specifies the * character length, not the byte length. 
*/ int pg_mbcharcliplen(const unsigned char *mbstr, int len, int limit) { int clen = 0; int nch = 0; int l; while (len > 0 && *mbstr) { l = pg_mblen(mbstr); nch++; if (nch > limit) break; clen += l; len -= l; mbstr += l; } return (clen); } void SetDatabaseEncoding(int encoding) { if (!PG_VALID_BE_ENCODING(encoding)) elog(ERROR, "SetDatabaseEncoding(): invalid database encoding"); DatabaseEncoding = &pg_enc2name_tbl[encoding]; Assert(DatabaseEncoding->encoding == encoding); } int GetDatabaseEncoding(void) { Assert(DatabaseEncoding); return (DatabaseEncoding->encoding); } const char * GetDatabaseEncodingName(void) { Assert(DatabaseEncoding); return (DatabaseEncoding->name); } Datum getdatabaseencoding(PG_FUNCTION_ARGS) { Assert(DatabaseEncoding); return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name)); } Datum pg_client_encoding(PG_FUNCTION_ARGS) { Assert(ClientEncoding); return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name)); }