diff options
Diffstat (limited to 'src/backend/utils/mb/conv.c')
-rw-r--r-- | src/backend/utils/mb/conv.c | 327 |
1 files changed, 171 insertions, 156 deletions
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c index d5c5c74a9ce..f957b6efd32 100644 --- a/src/backend/utils/mb/conv.c +++ b/src/backend/utils/mb/conv.c @@ -302,47 +302,62 @@ compare4(const void *p1, const void *p2) } /* - * convert 32bit wide character to mutibye stream pointed to by iso + * store 32bit character representation into multibyte stream */ -static unsigned char * -set_iso_code(unsigned char *iso, uint32 code) +static inline unsigned char * +store_coded_char(unsigned char *dest, uint32 code) { if (code & 0xff000000) - *iso++ = code >> 24; + *dest++ = code >> 24; if (code & 0x00ff0000) - *iso++ = (code & 0x00ff0000) >> 16; + *dest++ = code >> 16; if (code & 0x0000ff00) - *iso++ = (code & 0x0000ff00) >> 8; + *dest++ = code >> 8; if (code & 0x000000ff) - *iso++ = code & 0x000000ff; - return iso; + *dest++ = code; + return dest; } /* * UTF8 ---> local code * - * utf: input UTF8 string (need not be null-terminated). + * utf: input string in UTF8 encoding (need not be null-terminated) + * len: length of input string (in bytes) * iso: pointer to the output area (must be large enough!) - * map: the conversion map. - * cmap: the conversion map for combined characters. - * (optional) - * size1: the size of the conversion map. - * size2: the size of the conversion map for combined characters - * (optional) - * encoding: the PG identifier for the local encoding. - * len: length of input string. + (output string will be null-terminated) + * map: conversion map for single characters + * mapsize: number of entries in the conversion map + * cmap: conversion map for combined characters + * (optional, pass NULL if none) + * cmapsize: number of entries in the conversion map for combined characters + * (optional, pass 0 if none) + * conv_func: algorithmic encoding conversion function + * (optional, pass NULL if none) + * encoding: PG identifier for the local encoding + * + * For each character, the cmap (if provided) is consulted first; if no match, + * the map is consulted next; if still no match, the conv_func (if provided) + * is applied. An error is raised if no match is found. + * + * See pg_wchar.h for more details about the data structures used here. */ void -UtfToLocal(const unsigned char *utf, unsigned char *iso, - const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap, - int size1, int size2, int encoding, int len) +UtfToLocal(const unsigned char *utf, int len, + unsigned char *iso, + const pg_utf_to_local *map, int mapsize, + const pg_utf_to_local_combined *cmap, int cmapsize, + utf_local_conversion_func conv_func, + int encoding) { uint32 iutf; - uint32 cutf[2]; - uint32 code; - pg_utf_to_local *p; - pg_utf_to_local_combined *cp; int l; + const pg_utf_to_local *p; + const pg_utf_to_local_combined *cp; + + if (!PG_VALID_ENCODING(encoding)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid encoding number: %d", encoding))); for (; len > 0; len -= l) { @@ -351,7 +366,6 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso, break; l = pg_utf_mblen(utf); - if (len < l) break; @@ -360,11 +374,13 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso, if (l == 1) { - /* ASCII case is easy */ + /* ASCII case is easy, assume it's one-to-one conversion */ *iso++ = *utf++; continue; } - else if (l == 2) + + /* collect coded char of length l */ + if (l == 2) { iutf = *utf++ << 8; iutf |= *utf++; @@ -388,15 +404,14 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso, iutf = 0; /* keep compiler quiet */ } - /* - * first, try with combined map if possible - */ + /* First, try with combined map if possible */ if (cmap && len > l) { const unsigned char *utf_save = utf; int len_save = len; int l_save = l; + /* collect next character, same as above */ len -= l; l = pg_utf_mblen(utf); @@ -406,83 +421,83 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso, if (!pg_utf8_islegal(utf, l)) break; - cutf[0] = iutf; - - if (l == 1) + /* We assume ASCII character cannot be in combined map */ + if (l > 1) { - if (len_save > 1) + uint32 iutf2; + uint32 cutf[2]; + + if (l == 2) + { + iutf2 = *utf++ << 8; + iutf2 |= *utf++; + } + else if (l == 3) + { + iutf2 = *utf++ << 16; + iutf2 |= *utf++ << 8; + iutf2 |= *utf++; + } + else if (l == 4) + { + iutf2 = *utf++ << 24; + iutf2 |= *utf++ << 16; + iutf2 |= *utf++ << 8; + iutf2 |= *utf++; + } + else { - p = bsearch(&cutf[0], map, size1, - sizeof(pg_utf_to_local), compare1); - if (p == NULL) - report_untranslatable_char(PG_UTF8, encoding, - (const char *) (utf_save - l_save), len_save); - iso = set_iso_code(iso, p->code); + elog(ERROR, "unsupported character length %d", l); + iutf2 = 0; /* keep compiler quiet */ } - /* ASCII case is easy */ - *iso++ = *utf++; - continue; - } - else if (l == 2) - { - iutf = *utf++ << 8; - iutf |= *utf++; - } - else if (l == 3) - { - iutf = *utf++ << 16; - iutf |= *utf++ << 8; - iutf |= *utf++; - } - else if (l == 4) - { - iutf = *utf++ << 24; - iutf |= *utf++ << 16; - iutf |= *utf++ << 8; - iutf |= *utf++; - } - else - { - elog(ERROR, "unsupported character length %d", l); - iutf = 0; /* keep compiler quiet */ - } + cutf[0] = iutf; + cutf[1] = iutf2; - cutf[1] = iutf; - cp = bsearch(cutf, cmap, size2, - sizeof(pg_utf_to_local_combined), compare3); - if (cp) - code = cp->code; - else - { - /* not found in combined map. try with ordinary map */ - p = bsearch(&cutf[0], map, size1, - sizeof(pg_utf_to_local), compare1); - if (p == NULL) - report_untranslatable_char(PG_UTF8, encoding, - (const char *) (utf_save - l_save), len_save); - iso = set_iso_code(iso, p->code); - - p = bsearch(&cutf[1], map, size1, - sizeof(pg_utf_to_local), compare1); - if (p == NULL) - report_untranslatable_char(PG_UTF8, encoding, - (const char *) (utf - l), len); - code = p->code; + cp = bsearch(cutf, cmap, cmapsize, + sizeof(pg_utf_to_local_combined), compare3); + + if (cp) + { + iso = store_coded_char(iso, cp->code); + continue; + } } + + /* fail, so back up to reprocess second character next time */ + utf = utf_save; + len = len_save; + l = l_save; } - else /* no cmap or no remaining data */ + + /* Now check ordinary map */ + p = bsearch(&iutf, map, mapsize, + sizeof(pg_utf_to_local), compare1); + + if (p) { - p = bsearch(&iutf, map, size1, - sizeof(pg_utf_to_local), compare1); - if (p == NULL) - report_untranslatable_char(PG_UTF8, encoding, - (const char *) (utf - l), len); - code = p->code; + iso = store_coded_char(iso, p->code); + continue; + } + + /* if there's a conversion function, try that */ + if (conv_func) + { + uint32 converted = (*conv_func) (iutf); + + if (converted) + { + iso = store_coded_char(iso, converted); + continue; + } } - iso = set_iso_code(iso, code); + + /* failed to translate this character */ + report_untranslatable_char(PG_UTF8, encoding, + (const char *) (utf - l), len); } + /* if we broke out of loop early, must be invalid input */ if (len > 0) report_invalid_encoding(PG_UTF8, (const char *) utf, len); @@ -492,26 +507,38 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso, /* * local code ---> UTF8 * - * iso: input local string (need not be null-terminated). + * iso: input string in local encoding (need not be null-terminated) + * len: length of input string (in bytes) * utf: pointer to the output area (must be large enough!) - * map: the conversion map. - * cmap: the conversion map for combined characters. - * (optional) - * size1: the size of the conversion map. - * size2: the size of the conversion map for combined characters - * (optional) - * encoding: the PG identifier for the local encoding. - * len: length of input string. + (output string will be null-terminated) + * map: conversion map for single characters + * mapsize: number of entries in the conversion map + * cmap: conversion map for combined characters + * (optional, pass NULL if none) + * cmapsize: number of entries in the conversion map for combined characters + * (optional, pass 0 if none) + * conv_func: algorithmic encoding conversion function + * (optional, pass NULL if none) + * encoding: PG identifier for the local encoding + * + * For each character, the map is consulted first; if no match, the cmap + * (if provided) is consulted next; if still no match, the conv_func + * (if provided) is applied. An error is raised if no match is found. + * + * See pg_wchar.h for more details about the data structures used here. */ void -LocalToUtf(const unsigned char *iso, unsigned char *utf, - const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap, - int size1, int size2, int encoding, int len) +LocalToUtf(const unsigned char *iso, int len, + unsigned char *utf, + const pg_local_to_utf *map, int mapsize, + const pg_local_to_utf_combined *cmap, int cmapsize, + utf_local_conversion_func conv_func, + int encoding) { - unsigned int iiso; + uint32 iiso; int l; - pg_local_to_utf *p; - pg_local_to_utf_combined *cp; + const pg_local_to_utf *p; + const pg_local_to_utf_combined *cp; if (!PG_VALID_ENCODING(encoding)) ereport(ERROR, @@ -526,7 +553,7 @@ LocalToUtf(const unsigned char *iso, unsigned char *utf, if (!IS_HIGHBIT_SET(*iso)) { - /* ASCII case is easy */ + /* ASCII case is easy, assume it's one-to-one conversion */ *utf++ = *iso++; l = 1; continue; @@ -536,6 +563,7 @@ LocalToUtf(const unsigned char *iso, unsigned char *utf, if (l < 0) break; + /* collect coded char of length l */ if (l == 1) iiso = *iso++; else if (l == 2) @@ -562,61 +590,48 @@ LocalToUtf(const unsigned char *iso, unsigned char *utf, iiso = 0; /* keep compiler quiet */ } - p = bsearch(&iiso, map, size1, + /* First check ordinary map */ + p = bsearch(&iiso, map, mapsize, sizeof(pg_local_to_utf), compare2); - if (p == NULL) + if (p) { - /* - * not found in the ordinary map. if there's a combined character - * map, try with it - */ - if (cmap) - { - cp = bsearch(&iiso, cmap, size2, - sizeof(pg_local_to_utf_combined), compare4); + utf = store_coded_char(utf, p->utf); + continue; + } - if (cp) - { - if (cp->utf1 & 0xff000000) - *utf++ = cp->utf1 >> 24; - if (cp->utf1 & 0x00ff0000) - *utf++ = (cp->utf1 & 0x00ff0000) >> 16; - if (cp->utf1 & 0x0000ff00) - *utf++ = (cp->utf1 & 0x0000ff00) >> 8; - if (cp->utf1 & 0x000000ff) - *utf++ = cp->utf1 & 0x000000ff; - - if (cp->utf2 & 0xff000000) - *utf++ = cp->utf2 >> 24; - if (cp->utf2 & 0x00ff0000) - *utf++ = (cp->utf2 & 0x00ff0000) >> 16; - if (cp->utf2 & 0x0000ff00) - *utf++ = (cp->utf2 & 0x0000ff00) >> 8; - if (cp->utf2 & 0x000000ff) - *utf++ = cp->utf2 & 0x000000ff; + /* If there's a combined character map, try that */ + if (cmap) + { + cp = bsearch(&iiso, cmap, cmapsize, + sizeof(pg_local_to_utf_combined), compare4); - continue; - } + if (cp) + { + utf = store_coded_char(utf, cp->utf1); + utf = store_coded_char(utf, cp->utf2); + continue; } - - report_untranslatable_char(encoding, PG_UTF8, - (const char *) (iso - l), len); - } - else + + /* if there's a conversion function, try that */ + if (conv_func) { - if (p->utf & 0xff000000) - *utf++ = p->utf >> 24; - if (p->utf & 0x00ff0000) - *utf++ = (p->utf & 0x00ff0000) >> 16; - if (p->utf & 0x0000ff00) - *utf++ = (p->utf & 0x0000ff00) >> 8; - if (p->utf & 0x000000ff) - *utf++ = p->utf & 0x000000ff; + uint32 converted = (*conv_func) (iiso); + + if (converted) + { + utf = store_coded_char(utf, converted); + continue; + } } + + /* failed to translate this character */ + report_untranslatable_char(encoding, PG_UTF8, + (const char *) (iso - l), len); } + /* if we broke out of loop early, must be invalid input */ if (len > 0) report_invalid_encoding(encoding, (const char *) iso, len); |