about | summary | refs | log | tree | commit | diff
path: root/src/backend/utils/mb/conv.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/mb/conv.c')
-rw-r--r-- src/backend/utils/mb/conv.c 327
1 files changed, 171 insertions, 156 deletions
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c
index d5c5c74a9ce..f957b6efd32 100644
--- a/src/backend/utils/mb/conv.c
+++ b/src/backend/utils/mb/conv.c
@@ -302,47 +302,62 @@ compare4(const void *p1, const void *p2)
}
/*
- * convert 32bit wide character to mutibye stream pointed to by iso
+ * store 32bit character representation into multibyte stream
*/
-static unsigned char *
-set_iso_code(unsigned char *iso, uint32 code)
+static inline unsigned char *
+store_coded_char(unsigned char *dest, uint32 code)
{
if (code & 0xff000000)
- *iso++ = code >> 24;
+ *dest++ = code >> 24;
if (code & 0x00ff0000)
- *iso++ = (code & 0x00ff0000) >> 16;
+ *dest++ = code >> 16;
if (code & 0x0000ff00)
- *iso++ = (code & 0x0000ff00) >> 8;
+ *dest++ = code >> 8;
if (code & 0x000000ff)
- *iso++ = code & 0x000000ff;
- return iso;
+ *dest++ = code;
+ return dest;
}
/*
* UTF8 ---> local code
*
- * utf: input UTF8 string (need not be null-terminated).
+ * utf: input string in UTF8 encoding (need not be null-terminated)
+ * len: length of input string (in bytes)
* iso: pointer to the output area (must be large enough!)
- * map: the conversion map.
- * cmap: the conversion map for combined characters.
- * (optional)
- * size1: the size of the conversion map.
- * size2: the size of the conversion map for combined characters
- * (optional)
- * encoding: the PG identifier for the local encoding.
- * len: length of input string.
+ * (output string will be null-terminated)
+ * map: conversion map for single characters
+ * mapsize: number of entries in the conversion map
+ * cmap: conversion map for combined characters
+ * (optional, pass NULL if none)
+ * cmapsize: number of entries in the conversion map for combined characters
+ * (optional, pass 0 if none)
+ * conv_func: algorithmic encoding conversion function
+ * (optional, pass NULL if none)
+ * encoding: PG identifier for the local encoding
+ *
+ * For each character, the cmap (if provided) is consulted first; if no match,
+ * the map is consulted next; if still no match, the conv_func (if provided)
+ * is applied. An error is raised if no match is found.
+ *
+ * See pg_wchar.h for more details about the data structures used here.
*/
void
-UtfToLocal(const unsigned char *utf, unsigned char *iso,
- const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
- int size1, int size2, int encoding, int len)
+UtfToLocal(const unsigned char *utf, int len,
+ unsigned char *iso,
+ const pg_utf_to_local *map, int mapsize,
+ const pg_utf_to_local_combined *cmap, int cmapsize,
+ utf_local_conversion_func conv_func,
+ int encoding)
{
uint32 iutf;
- uint32 cutf[2];
- uint32 code;
- pg_utf_to_local *p;
- pg_utf_to_local_combined *cp;
int l;
+ const pg_utf_to_local *p;
+ const pg_utf_to_local_combined *cp;
+
+ if (!PG_VALID_ENCODING(encoding))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid encoding number: %d", encoding)));
for (; len > 0; len -= l)
{
@@ -351,7 +366,6 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
break;
l = pg_utf_mblen(utf);
-
if (len < l)
break;
@@ -360,11 +374,13 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
if (l == 1)
{
- /* ASCII case is easy */
+ /* ASCII case is easy, assume it's one-to-one conversion */
*iso++ = *utf++;
continue;
}
- else if (l == 2)
+
+ /* collect coded char of length l */
+ if (l == 2)
{
iutf = *utf++ << 8;
iutf |= *utf++;
@@ -388,15 +404,14 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
iutf = 0; /* keep compiler quiet */
}
- /*
- * first, try with combined map if possible
- */
+ /* First, try with combined map if possible */
if (cmap && len > l)
{
const unsigned char *utf_save = utf;
int len_save = len;
int l_save = l;
+ /* collect next character, same as above */
len -= l;
l = pg_utf_mblen(utf);
@@ -406,83 +421,83 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
if (!pg_utf8_islegal(utf, l))
break;
- cutf[0] = iutf;
-
- if (l == 1)
+ /* We assume ASCII character cannot be in combined map */
+ if (l > 1)
{
- if (len_save > 1)
+ uint32 iutf2;
+ uint32 cutf[2];
+
+ if (l == 2)
+ {
+ iutf2 = *utf++ << 8;
+ iutf2 |= *utf++;
+ }
+ else if (l == 3)
+ {
+ iutf2 = *utf++ << 16;
+ iutf2 |= *utf++ << 8;
+ iutf2 |= *utf++;
+ }
+ else if (l == 4)
+ {
+ iutf2 = *utf++ << 24;
+ iutf2 |= *utf++ << 16;
+ iutf2 |= *utf++ << 8;
+ iutf2 |= *utf++;
+ }
+ else
{
- p = bsearch(&cutf[0], map, size1,
- sizeof(pg_utf_to_local), compare1);
- if (p == NULL)
- report_untranslatable_char(PG_UTF8, encoding,
- (const char *) (utf_save - l_save), len_save);
- iso = set_iso_code(iso, p->code);
+ elog(ERROR, "unsupported character length %d", l);
+ iutf2 = 0; /* keep compiler quiet */
}
- /* ASCII case is easy */
- *iso++ = *utf++;
- continue;
- }
- else if (l == 2)
- {
- iutf = *utf++ << 8;
- iutf |= *utf++;
- }
- else if (l == 3)
- {
- iutf = *utf++ << 16;
- iutf |= *utf++ << 8;
- iutf |= *utf++;
- }
- else if (l == 4)
- {
- iutf = *utf++ << 24;
- iutf |= *utf++ << 16;
- iutf |= *utf++ << 8;
- iutf |= *utf++;
- }
- else
- {
- elog(ERROR, "unsupported character length %d", l);
- iutf = 0; /* keep compiler quiet */
- }
+ cutf[0] = iutf;
+ cutf[1] = iutf2;
- cutf[1] = iutf;
- cp = bsearch(cutf, cmap, size2,
- sizeof(pg_utf_to_local_combined), compare3);
- if (cp)
- code = cp->code;
- else
- {
- /* not found in combined map. try with ordinary map */
- p = bsearch(&cutf[0], map, size1,
- sizeof(pg_utf_to_local), compare1);
- if (p == NULL)
- report_untranslatable_char(PG_UTF8, encoding,
- (const char *) (utf_save - l_save), len_save);
- iso = set_iso_code(iso, p->code);
-
- p = bsearch(&cutf[1], map, size1,
- sizeof(pg_utf_to_local), compare1);
- if (p == NULL)
- report_untranslatable_char(PG_UTF8, encoding,
- (const char *) (utf - l), len);
- code = p->code;
+ cp = bsearch(cutf, cmap, cmapsize,
+ sizeof(pg_utf_to_local_combined), compare3);
+
+ if (cp)
+ {
+ iso = store_coded_char(iso, cp->code);
+ continue;
+ }
}
+
+ /* fail, so back up to reprocess second character next time */
+ utf = utf_save;
+ len = len_save;
+ l = l_save;
}
- else /* no cmap or no remaining data */
+
+ /* Now check ordinary map */
+ p = bsearch(&iutf, map, mapsize,
+ sizeof(pg_utf_to_local), compare1);
+
+ if (p)
{
- p = bsearch(&iutf, map, size1,
- sizeof(pg_utf_to_local), compare1);
- if (p == NULL)
- report_untranslatable_char(PG_UTF8, encoding,
- (const char *) (utf - l), len);
- code = p->code;
+ iso = store_coded_char(iso, p->code);
+ continue;
+ }
+
+ /* if there's a conversion function, try that */
+ if (conv_func)
+ {
+ uint32 converted = (*conv_func) (iutf);
+
+ if (converted)
+ {
+ iso = store_coded_char(iso, converted);
+ continue;
+ }
}
- iso = set_iso_code(iso, code);
+
+ /* failed to translate this character */
+ report_untranslatable_char(PG_UTF8, encoding,
+ (const char *) (utf - l), len);
}
+ /* if we broke out of loop early, must be invalid input */
if (len > 0)
report_invalid_encoding(PG_UTF8, (const char *) utf, len);
@@ -492,26 +507,38 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
/*
* local code ---> UTF8
*
- * iso: input local string (need not be null-terminated).
+ * iso: input string in local encoding (need not be null-terminated)
+ * len: length of input string (in bytes)
* utf: pointer to the output area (must be large enough!)
- * map: the conversion map.
- * cmap: the conversion map for combined characters.
- * (optional)
- * size1: the size of the conversion map.
- * size2: the size of the conversion map for combined characters
- * (optional)
- * encoding: the PG identifier for the local encoding.
- * len: length of input string.
+ * (output string will be null-terminated)
+ * map: conversion map for single characters
+ * mapsize: number of entries in the conversion map
+ * cmap: conversion map for combined characters
+ * (optional, pass NULL if none)
+ * cmapsize: number of entries in the conversion map for combined characters
+ * (optional, pass 0 if none)
+ * conv_func: algorithmic encoding conversion function
+ * (optional, pass NULL if none)
+ * encoding: PG identifier for the local encoding
+ *
+ * For each character, the map is consulted first; if no match, the cmap
+ * (if provided) is consulted next; if still no match, the conv_func
+ * (if provided) is applied. An error is raised if no match is found.
+ *
+ * See pg_wchar.h for more details about the data structures used here.
*/
void
-LocalToUtf(const unsigned char *iso, unsigned char *utf,
- const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap,
- int size1, int size2, int encoding, int len)
+LocalToUtf(const unsigned char *iso, int len,
+ unsigned char *utf,
+ const pg_local_to_utf *map, int mapsize,
+ const pg_local_to_utf_combined *cmap, int cmapsize,
+ utf_local_conversion_func conv_func,
+ int encoding)
{
- unsigned int iiso;
+ uint32 iiso;
int l;
- pg_local_to_utf *p;
- pg_local_to_utf_combined *cp;
+ const pg_local_to_utf *p;
+ const pg_local_to_utf_combined *cp;
if (!PG_VALID_ENCODING(encoding))
ereport(ERROR,
@@ -526,7 +553,7 @@ LocalToUtf(const unsigned char *iso, unsigned char *utf,
if (!IS_HIGHBIT_SET(*iso))
{
- /* ASCII case is easy */
+ /* ASCII case is easy, assume it's one-to-one conversion */
*utf++ = *iso++;
l = 1;
continue;
@@ -536,6 +563,7 @@ LocalToUtf(const unsigned char *iso, unsigned char *utf,
if (l < 0)
break;
+ /* collect coded char of length l */
if (l == 1)
iiso = *iso++;
else if (l == 2)
@@ -562,61 +590,48 @@ LocalToUtf(const unsigned char *iso, unsigned char *utf,
iiso = 0; /* keep compiler quiet */
}
- p = bsearch(&iiso, map, size1,
+ /* First check ordinary map */
+ p = bsearch(&iiso, map, mapsize,
sizeof(pg_local_to_utf), compare2);
- if (p == NULL)
+ if (p)
{
- /*
- * not found in the ordinary map. if there's a combined character
- * map, try with it
- */
- if (cmap)
- {
- cp = bsearch(&iiso, cmap, size2,
- sizeof(pg_local_to_utf_combined), compare4);
+ utf = store_coded_char(utf, p->utf);
+ continue;
+ }
- if (cp)
- {
- if (cp->utf1 & 0xff000000)
- *utf++ = cp->utf1 >> 24;
- if (cp->utf1 & 0x00ff0000)
- *utf++ = (cp->utf1 & 0x00ff0000) >> 16;
- if (cp->utf1 & 0x0000ff00)
- *utf++ = (cp->utf1 & 0x0000ff00) >> 8;
- if (cp->utf1 & 0x000000ff)
- *utf++ = cp->utf1 & 0x000000ff;
-
- if (cp->utf2 & 0xff000000)
- *utf++ = cp->utf2 >> 24;
- if (cp->utf2 & 0x00ff0000)
- *utf++ = (cp->utf2 & 0x00ff0000) >> 16;
- if (cp->utf2 & 0x0000ff00)
- *utf++ = (cp->utf2 & 0x0000ff00) >> 8;
- if (cp->utf2 & 0x000000ff)
- *utf++ = cp->utf2 & 0x000000ff;
+ /* If there's a combined character map, try that */
+ if (cmap)
+ {
+ cp = bsearch(&iiso, cmap, cmapsize,
+ sizeof(pg_local_to_utf_combined), compare4);
- continue;
- }
+ if (cp)
+ {
+ utf = store_coded_char(utf, cp->utf1);
+ utf = store_coded_char(utf, cp->utf2);
+ continue;
}
-
- report_untranslatable_char(encoding, PG_UTF8,
- (const char *) (iso - l), len);
-
}
- else
+
+ /* if there's a conversion function, try that */
+ if (conv_func)
{
- if (p->utf & 0xff000000)
- *utf++ = p->utf >> 24;
- if (p->utf & 0x00ff0000)
- *utf++ = (p->utf & 0x00ff0000) >> 16;
- if (p->utf & 0x0000ff00)
- *utf++ = (p->utf & 0x0000ff00) >> 8;
- if (p->utf & 0x000000ff)
- *utf++ = p->utf & 0x000000ff;
+ uint32 converted = (*conv_func) (iiso);
+
+ if (converted)
+ {
+ utf = store_coded_char(utf, converted);
+ continue;
+ }
}
+
+ /* failed to translate this character */
+ report_untranslatable_char(encoding, PG_UTF8,
+ (const char *) (iso - l), len);
}
+ /* if we broke out of loop early, must be invalid input */
if (len > 0)
report_invalid_encoding(encoding, (const char *) iso, len);