aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/mb/conv.c
diff options
context:
space:
mode:
author: Tom Lane <tgl@sss.pgh.pa.us> 2015-05-14 22:27:07 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us> 2015-05-14 22:27:12 -0400
commit: 7730f48ede0d222e7f750541d3d5f0f74d75d99b (patch)
tree: 472b56a394d55b08d31fcbaa1015d2475c788795 /src/backend/utils/mb/conv.c
parent: 83e176ec18d2a91dbea1d0d1bd94c38dc47cd77c (diff)
download: postgresql-7730f48ede0d222e7f750541d3d5f0f74d75d99b.tar.gz
postgresql-7730f48ede0d222e7f750541d3d5f0f74d75d99b.zip
Teach UtfToLocal/LocalToUtf to support algorithmic encoding conversions.
Until now, these functions have only supported encoding conversions using lookup tables, which is fine as long as there aren't too many code points to convert. However, GB18030 expects all 1.1 million Unicode code points to be convertible, which would require a ridiculously-sized lookup table. Fortunately, a large fraction of those conversions can be expressed through arithmetic, i.e., the conversions are one-to-one in certain defined ranges. To support that, provide a callback function that is used after consulting the lookup tables. (This patch doesn't actually change anything about the GB18030 conversion behavior; it just provides infrastructure for fixing it.) Since this requires changing the APIs of UtfToLocal/LocalToUtf anyway, take the opportunity to rearrange their argument lists into what seems to me a saner order. And beautify the call sites by using lengthof() instead of error-prone sizeof() arithmetic. In passing, also mark all the lookup tables used by these calls "const". This moves an impressive amount of stuff into the text segment, at least on my machine, and is safer anyhow.
Diffstat (limited to 'src/backend/utils/mb/conv.c')
-rw-r--r-- src/backend/utils/mb/conv.c 327
1 files changed, 171 insertions, 156 deletions
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c
index d5c5c74a9ce..f957b6efd32 100644
--- a/src/backend/utils/mb/conv.c
+++ b/src/backend/utils/mb/conv.c
@@ -302,47 +302,62 @@ compare4(const void *p1, const void *p2)
}
/*
- * convert 32bit wide character to mutibye stream pointed to by iso
+ * store 32bit character representation into multibyte stream
*/
-static unsigned char *
-set_iso_code(unsigned char *iso, uint32 code)
+static inline unsigned char *
+store_coded_char(unsigned char *dest, uint32 code)
{
if (code & 0xff000000)
- *iso++ = code >> 24;
+ *dest++ = code >> 24;
if (code & 0x00ff0000)
- *iso++ = (code & 0x00ff0000) >> 16;
+ *dest++ = code >> 16;
if (code & 0x0000ff00)
- *iso++ = (code & 0x0000ff00) >> 8;
+ *dest++ = code >> 8;
if (code & 0x000000ff)
- *iso++ = code & 0x000000ff;
- return iso;
+ *dest++ = code;
+ return dest;
}
/*
* UTF8 ---> local code
*
- * utf: input UTF8 string (need not be null-terminated).
+ * utf: input string in UTF8 encoding (need not be null-terminated)
+ * len: length of input string (in bytes)
* iso: pointer to the output area (must be large enough!)
- * map: the conversion map.
- * cmap: the conversion map for combined characters.
- * (optional)
- * size1: the size of the conversion map.
- * size2: the size of the conversion map for combined characters
- * (optional)
- * encoding: the PG identifier for the local encoding.
- * len: length of input string.
+ (output string will be null-terminated)
+ * map: conversion map for single characters
+ * mapsize: number of entries in the conversion map
+ * cmap: conversion map for combined characters
+ * (optional, pass NULL if none)
+ * cmapsize: number of entries in the conversion map for combined characters
+ * (optional, pass 0 if none)
+ * conv_func: algorithmic encoding conversion function
+ * (optional, pass NULL if none)
+ * encoding: PG identifier for the local encoding
+ *
+ * For each character, the cmap (if provided) is consulted first; if no match,
+ * the map is consulted next; if still no match, the conv_func (if provided)
+ * is applied. An error is raised if no match is found.
+ *
+ * See pg_wchar.h for more details about the data structures used here.
*/
void
-UtfToLocal(const unsigned char *utf, unsigned char *iso,
- const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
- int size1, int size2, int encoding, int len)
+UtfToLocal(const unsigned char *utf, int len,
+ unsigned char *iso,
+ const pg_utf_to_local *map, int mapsize,
+ const pg_utf_to_local_combined *cmap, int cmapsize,
+ utf_local_conversion_func conv_func,
+ int encoding)
{
uint32 iutf;
- uint32 cutf[2];
- uint32 code;
- pg_utf_to_local *p;
- pg_utf_to_local_combined *cp;
int l;
+ const pg_utf_to_local *p;
+ const pg_utf_to_local_combined *cp;
+
+ if (!PG_VALID_ENCODING(encoding))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid encoding number: %d", encoding)));
for (; len > 0; len -= l)
{
@@ -351,7 +366,6 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
break;
l = pg_utf_mblen(utf);
-
if (len < l)
break;
@@ -360,11 +374,13 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
if (l == 1)
{
- /* ASCII case is easy */
+ /* ASCII case is easy, assume it's one-to-one conversion */
*iso++ = *utf++;
continue;
}
- else if (l == 2)
+
+ /* collect coded char of length l */
+ if (l == 2)
{
iutf = *utf++ << 8;
iutf |= *utf++;
@@ -388,15 +404,14 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
iutf = 0; /* keep compiler quiet */
}
- /*
- * first, try with combined map if possible
- */
+ /* First, try with combined map if possible */
if (cmap && len > l)
{
const unsigned char *utf_save = utf;
int len_save = len;
int l_save = l;
+ /* collect next character, same as above */
len -= l;
l = pg_utf_mblen(utf);
@@ -406,83 +421,83 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
if (!pg_utf8_islegal(utf, l))
break;
- cutf[0] = iutf;
-
- if (l == 1)
+ /* We assume ASCII character cannot be in combined map */
+ if (l > 1)
{
- if (len_save > 1)
+ uint32 iutf2;
+ uint32 cutf[2];
+
+ if (l == 2)
+ {
+ iutf2 = *utf++ << 8;
+ iutf2 |= *utf++;
+ }
+ else if (l == 3)
+ {
+ iutf2 = *utf++ << 16;
+ iutf2 |= *utf++ << 8;
+ iutf2 |= *utf++;
+ }
+ else if (l == 4)
+ {
+ iutf2 = *utf++ << 24;
+ iutf2 |= *utf++ << 16;
+ iutf2 |= *utf++ << 8;
+ iutf2 |= *utf++;
+ }
+ else
{
- p = bsearch(&cutf[0], map, size1,
- sizeof(pg_utf_to_local), compare1);
- if (p == NULL)
- report_untranslatable_char(PG_UTF8, encoding,
- (const char *) (utf_save - l_save), len_save);
- iso = set_iso_code(iso, p->code);
+ elog(ERROR, "unsupported character length %d", l);
+ iutf2 = 0; /* keep compiler quiet */
}
- /* ASCII case is easy */
- *iso++ = *utf++;
- continue;
- }
- else if (l == 2)
- {
- iutf = *utf++ << 8;
- iutf |= *utf++;
- }
- else if (l == 3)
- {
- iutf = *utf++ << 16;
- iutf |= *utf++ << 8;
- iutf |= *utf++;
- }
- else if (l == 4)
- {
- iutf = *utf++ << 24;
- iutf |= *utf++ << 16;
- iutf |= *utf++ << 8;
- iutf |= *utf++;
- }
- else
- {
- elog(ERROR, "unsupported character length %d", l);
- iutf = 0; /* keep compiler quiet */
- }
+ cutf[0] = iutf;
+ cutf[1] = iutf2;
- cutf[1] = iutf;
- cp = bsearch(cutf, cmap, size2,
- sizeof(pg_utf_to_local_combined), compare3);
- if (cp)
- code = cp->code;
- else
- {
- /* not found in combined map. try with ordinary map */
- p = bsearch(&cutf[0], map, size1,
- sizeof(pg_utf_to_local), compare1);
- if (p == NULL)
- report_untranslatable_char(PG_UTF8, encoding,
- (const char *) (utf_save - l_save), len_save);
- iso = set_iso_code(iso, p->code);
-
- p = bsearch(&cutf[1], map, size1,
- sizeof(pg_utf_to_local), compare1);
- if (p == NULL)
- report_untranslatable_char(PG_UTF8, encoding,
- (const char *) (utf - l), len);
- code = p->code;
+ cp = bsearch(cutf, cmap, cmapsize,
+ sizeof(pg_utf_to_local_combined), compare3);
+
+ if (cp)
+ {
+ iso = store_coded_char(iso, cp->code);
+ continue;
+ }
}
+
+ /* fail, so back up to reprocess second character next time */
+ utf = utf_save;
+ len = len_save;
+ l = l_save;
}
- else /* no cmap or no remaining data */
+
+ /* Now check ordinary map */
+ p = bsearch(&iutf, map, mapsize,
+ sizeof(pg_utf_to_local), compare1);
+
+ if (p)
{
- p = bsearch(&iutf, map, size1,
- sizeof(pg_utf_to_local), compare1);
- if (p == NULL)
- report_untranslatable_char(PG_UTF8, encoding,
- (const char *) (utf - l), len);
- code = p->code;
+ iso = store_coded_char(iso, p->code);
+ continue;
+ }
+
+ /* if there's a conversion function, try that */
+ if (conv_func)
+ {
+ uint32 converted = (*conv_func) (iutf);
+
+ if (converted)
+ {
+ iso = store_coded_char(iso, converted);
+ continue;
+ }
}
- iso = set_iso_code(iso, code);
+
+ /* failed to translate this character */
+ report_untranslatable_char(PG_UTF8, encoding,
+ (const char *) (utf - l), len);
}
+ /* if we broke out of loop early, must be invalid input */
if (len > 0)
report_invalid_encoding(PG_UTF8, (const char *) utf, len);
@@ -492,26 +507,38 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
/*
* local code ---> UTF8
*
- * iso: input local string (need not be null-terminated).
+ * iso: input string in local encoding (need not be null-terminated)
+ * len: length of input string (in bytes)
* utf: pointer to the output area (must be large enough!)
- * map: the conversion map.
- * cmap: the conversion map for combined characters.
- * (optional)
- * size1: the size of the conversion map.
- * size2: the size of the conversion map for combined characters
- * (optional)
- * encoding: the PG identifier for the local encoding.
- * len: length of input string.
+ (output string will be null-terminated)
+ * map: conversion map for single characters
+ * mapsize: number of entries in the conversion map
+ * cmap: conversion map for combined characters
+ * (optional, pass NULL if none)
+ * cmapsize: number of entries in the conversion map for combined characters
+ * (optional, pass 0 if none)
+ * conv_func: algorithmic encoding conversion function
+ * (optional, pass NULL if none)
+ * encoding: PG identifier for the local encoding
+ *
+ * For each character, the map is consulted first; if no match, the cmap
+ * (if provided) is consulted next; if still no match, the conv_func
+ * (if provided) is applied. An error is raised if no match is found.
+ *
+ * See pg_wchar.h for more details about the data structures used here.
*/
void
-LocalToUtf(const unsigned char *iso, unsigned char *utf,
- const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap,
- int size1, int size2, int encoding, int len)
+LocalToUtf(const unsigned char *iso, int len,
+ unsigned char *utf,
+ const pg_local_to_utf *map, int mapsize,
+ const pg_local_to_utf_combined *cmap, int cmapsize,
+ utf_local_conversion_func conv_func,
+ int encoding)
{
- unsigned int iiso;
+ uint32 iiso;
int l;
- pg_local_to_utf *p;
- pg_local_to_utf_combined *cp;
+ const pg_local_to_utf *p;
+ const pg_local_to_utf_combined *cp;
if (!PG_VALID_ENCODING(encoding))
ereport(ERROR,
@@ -526,7 +553,7 @@ LocalToUtf(const unsigned char *iso, unsigned char *utf,
if (!IS_HIGHBIT_SET(*iso))
{
- /* ASCII case is easy */
+ /* ASCII case is easy, assume it's one-to-one conversion */
*utf++ = *iso++;
l = 1;
continue;
@@ -536,6 +563,7 @@ LocalToUtf(const unsigned char *iso, unsigned char *utf,
if (l < 0)
break;
+ /* collect coded char of length l */
if (l == 1)
iiso = *iso++;
else if (l == 2)
@@ -562,61 +590,48 @@ LocalToUtf(const unsigned char *iso, unsigned char *utf,
iiso = 0; /* keep compiler quiet */
}
- p = bsearch(&iiso, map, size1,
+ /* First check ordinary map */
+ p = bsearch(&iiso, map, mapsize,
sizeof(pg_local_to_utf), compare2);
- if (p == NULL)
+ if (p)
{
- /*
- * not found in the ordinary map. if there's a combined character
- * map, try with it
- */
- if (cmap)
- {
- cp = bsearch(&iiso, cmap, size2,
- sizeof(pg_local_to_utf_combined), compare4);
+ utf = store_coded_char(utf, p->utf);
+ continue;
+ }
- if (cp)
- {
- if (cp->utf1 & 0xff000000)
- *utf++ = cp->utf1 >> 24;
- if (cp->utf1 & 0x00ff0000)
- *utf++ = (cp->utf1 & 0x00ff0000) >> 16;
- if (cp->utf1 & 0x0000ff00)
- *utf++ = (cp->utf1 & 0x0000ff00) >> 8;
- if (cp->utf1 & 0x000000ff)
- *utf++ = cp->utf1 & 0x000000ff;
-
- if (cp->utf2 & 0xff000000)
- *utf++ = cp->utf2 >> 24;
- if (cp->utf2 & 0x00ff0000)
- *utf++ = (cp->utf2 & 0x00ff0000) >> 16;
- if (cp->utf2 & 0x0000ff00)
- *utf++ = (cp->utf2 & 0x0000ff00) >> 8;
- if (cp->utf2 & 0x000000ff)
- *utf++ = cp->utf2 & 0x000000ff;
+ /* If there's a combined character map, try that */
+ if (cmap)
+ {
+ cp = bsearch(&iiso, cmap, cmapsize,
+ sizeof(pg_local_to_utf_combined), compare4);
- continue;
- }
+ if (cp)
+ {
+ utf = store_coded_char(utf, cp->utf1);
+ utf = store_coded_char(utf, cp->utf2);
+ continue;
}
-
- report_untranslatable_char(encoding, PG_UTF8,
- (const char *) (iso - l), len);
-
}
- else
+
+ /* if there's a conversion function, try that */
+ if (conv_func)
{
- if (p->utf & 0xff000000)
- *utf++ = p->utf >> 24;
- if (p->utf & 0x00ff0000)
- *utf++ = (p->utf & 0x00ff0000) >> 16;
- if (p->utf & 0x0000ff00)
- *utf++ = (p->utf & 0x0000ff00) >> 8;
- if (p->utf & 0x000000ff)
- *utf++ = p->utf & 0x000000ff;
+ uint32 converted = (*conv_func) (iiso);
+
+ if (converted)
+ {
+ utf = store_coded_char(utf, converted);
+ continue;
+ }
}
+
+ /* failed to translate this character */
+ report_untranslatable_char(encoding, PG_UTF8,
+ (const char *) (iso - l), len);
}
+ /* if we broke out of loop early, must be invalid input */
if (len > 0)
report_invalid_encoding(encoding, (const char *) iso, len);