aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/mb/conv.c
diff options
context:
space:
mode:
authorMarc G. Fournier <scrappy@hub.org>1998-07-24 03:32:46 +0000
committerMarc G. Fournier <scrappy@hub.org>1998-07-24 03:32:46 +0000
commitbf00bbb0c4940b80b46b7e5b379cd64184f2262f (patch)
treebf32bf3bafe6f367ee97249c83afb4c9e9a637af /src/backend/utils/mb/conv.c
parent6e66468f3a160878111578a93be2852635eb4f4d (diff)
downloadpostgresql-bf00bbb0c4940b80b46b7e5b379cd64184f2262f.tar.gz
postgresql-bf00bbb0c4940b80b46b7e5b379cd64184f2262f.zip
I really hope that I haven't missed anything in this one...
From: t-ishii@sra.co.jp Attached are patches to enhance the multi-byte support. (patches are against 7/18 snapshot) * determine encoding at initdb/createdb rather than compile time Now initdb/createdb has an option to specify the encoding. Also, I modified the syntax of CREATE DATABASE to accept an encoding option. See README.mb for more details. For this purpose I have added a new column "encoding" to pg_database. Also pg_attribute and pg_class are changed to catch up with the modification to pg_database. Actually I have added pg_database_mb.h, pg_attribute_mb.h and pg_class_mb.h. These are used only when MB is enabled. The reason for having separate files is I couldn't find a way to use ifdef or whatever in those files. I have to admit it looks ugly. No way. * support for PGCLIENTENCODING when issuing COPY command commands/copy.c modified. * support for SQL92 syntax "SET NAMES" See gram.y. * support for LATIN2-5 * add UNICODE regression test case * new test suite for MB New directory test/mb added. * clean up source files Basic idea is to have MB's own subdirectory for easier maintenance. These are include/mb and backend/utils/mb.
Diffstat (limited to 'src/backend/utils/mb/conv.c')
-rw-r--r--src/backend/utils/mb/conv.c386
1 files changed, 386 insertions, 0 deletions
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c
new file mode 100644
index 00000000000..bca95c5f6b6
--- /dev/null
+++ b/src/backend/utils/mb/conv.c
@@ -0,0 +1,386 @@
+/*
+ * conversion between client encoding and server internal encoding
+ * (currently mule internal code (mic) is used)
+ * Tatsuo Ishii
+ * $Id: conv.c,v 1.1 1998/07/24 03:31:56 scrappy Exp $
+ */
+#include <stdio.h>
+#include <string.h>
+
+#include "mb/pg_wchar.h"
+
+/*
+ * Write a printable stand-in for a MIC character that the target encoding
+ * cannot represent.
+ *
+ * mic: in/out cursor into the MIC input.  On entry it points at the first
+ *      byte of the offending character; on return it points just past the
+ *      last byte (the byte count is obtained from pg_mic_mblen()).
+ * p:   in/out cursor into the output.  Receives '(', two lowercase hex
+ *      digits per consumed input byte, then ')', and is advanced past them.
+ *
+ * No terminating NUL is written and the output size is not checked: the
+ * caller must have room for 2 * length + 2 bytes.
+ */
+static void printBogusChar(unsigned char **mic, unsigned char **p)
+{
+    /*
+     * Direct nibble lookup: yields the same two digits as "%02x" without
+     * formatting every byte through sprintf() into a scratch buffer.
+     */
+    static const char hexdigits[] = "0123456789abcdef";
+    unsigned char *src = *mic;
+    unsigned char *dst = *p;
+    int l = pg_mic_mblen(src);
+
+    *dst++ = '(';
+    /* "> 0" so that an unexpected non-positive length cannot run away */
+    while (l-- > 0) {
+        unsigned char b = *src++;
+
+        *dst++ = hexdigits[b >> 4];
+        *dst++ = hexdigits[b & 0x0f];
+    }
+    *dst++ = ')';
+
+    /* publish the advanced cursors back to the caller */
+    *mic = src;
+    *p = dst;
+}
+
+/*
+ * SJIS ---> MIC
+ *
+ * sjis: input bytes; conversion stops at a NUL byte or after len bytes.
+ * p:    output buffer, always NUL-terminated on return.  Output is not
+ *       bounds-checked; the longest expansion is one input byte becoming
+ *       two output bytes (kana), so 2 * len + 1 bytes are sufficient.
+ * len:  number of input bytes to convert.
+ *
+ * Bytes 0xa1..0xdf become LC_JISX0201K + byte, other bytes above 0x7f are
+ * taken as the lead byte of a two-byte character and become LC_JISX0208 +
+ * two bytes, and everything else is copied through unchanged.
+ *
+ * A lead byte whose trail byte is missing (input exhausted or NUL) ends
+ * the conversion instead of reading past the end of the input.
+ */
+static void sjis2mic(unsigned char *sjis, unsigned char *p, int len)
+{
+    int c1, c2;
+
+    while (len > 0 && (c1 = *sjis++)) {
+        if (c1 >= 0xa1 && c1 <= 0xdf) {    /* 1 byte kana? */
+            len--;
+            *p++ = LC_JISX0201K;
+            *p++ = c1;
+        } else if (c1 > 0x7f) {    /* kanji? */
+            /* truncated character: never consume the terminator as data */
+            if (len < 2 || *sjis == '\0')
+                break;
+            c2 = *sjis++;
+            len -= 2;
+            *p++ = LC_JISX0208;
+            /* re-base the lead/trail pair; arithmetic kept as originally written */
+            *p++ = ((c1 & 0x3f)<<1) + 0x9f + (c2 > 0x9e);
+            *p++ = c2 + ((c2 > 0x9e)? 2 : 0x60) + (c2 < 0x80);
+        } else {    /* should be ASCII */
+            len--;
+            *p++ = c1;
+        }
+    }
+    *p = '\0';
+}
+
+/*
+ * MIC ---> SJIS
+ *
+ * Walks the MIC input one character at a time until a NUL lead byte is
+ * seen or len drops to zero or below; len is reduced by pg_mic_mblen()
+ * of each character.  LC_JISX0201K characters yield their single payload
+ * byte, LC_JISX0208 characters yield two re-based bytes, any other lead
+ * byte above 0x7f is rendered through printBogusChar(), and the rest is
+ * copied as is.  The output is NUL-terminated and not bounds-checked.
+ */
+static void mic2sjis(unsigned char *mic, unsigned char *p, int len)
+{
+    int lead;
+    int hi, lo;
+
+    while (len > 0 && (lead = *mic) != 0) {
+        len -= pg_mic_mblen(mic);
+
+        switch (lead) {
+        case LC_JISX0201K:
+            *p++ = mic[1];
+            mic += 2;
+            break;
+
+        case LC_JISX0208:
+            hi = mic[1];
+            lo = mic[2];
+            mic += 3;
+            *p++ = ((hi - 0xa1) >> 1) + ((hi < 0xdf) ? 0x81 : 0xc1);
+            *p++ = lo - ((hi & 1) ? ((lo < 0xe0) ? 0x61 : 0x60) : 2);
+            break;
+
+        default:
+            if (lead > 0x7f) {
+                /* cannot convert to SJIS; mic still points at the lead byte */
+                printBogusChar(&mic, &p);
+            } else {
+                /* should be ASCII */
+                *p++ = lead;
+                mic++;
+            }
+            break;
+        }
+    }
+    *p = '\0';
+}
+
+/*
+ * EUC_JP ---> MIC
+ *
+ * euc: input bytes; conversion stops at a NUL byte or after len bytes.
+ * p:   output buffer, always NUL-terminated on return; not bounds-checked.
+ *      The longest expansion is a two-byte input character becoming three
+ *      output bytes.
+ * len: number of input bytes to convert.
+ *
+ * SS2 + byte becomes LC_JISX0201K + byte, SS3 + two bytes becomes
+ * LC_JISX0212 + two bytes, any other byte with the high bit set starts a
+ * two-byte character that becomes LC_JISX0208 + two bytes, and everything
+ * else is copied through unchanged.
+ *
+ * A multibyte character whose trailing bytes are missing (input exhausted
+ * or NUL) ends the conversion instead of reading past the end of the input.
+ */
+static void euc_jp2mic(unsigned char *euc, unsigned char *p, int len)
+{
+    int c1;
+
+    while (len > 0 && (c1 = *euc++)) {
+        if (c1 == SS2) {    /* 1 byte kana? */
+            if (len < 2 || euc[0] == '\0')
+                break;      /* truncated */
+            len -= 2;
+            *p++ = LC_JISX0201K;
+            *p++ = *euc++;
+        } else if (c1 == SS3) {    /* JIS X0212 kanji? */
+            if (len < 3 || euc[0] == '\0' || euc[1] == '\0')
+                break;      /* truncated */
+            len -= 3;
+            *p++ = LC_JISX0212;
+            *p++ = *euc++;
+            *p++ = *euc++;
+        } else if (c1 & 0x80) {    /* kanji? */
+            if (len < 2 || euc[0] == '\0')
+                break;      /* truncated */
+            len -= 2;
+            *p++ = LC_JISX0208;
+            *p++ = c1;
+            *p++ = *euc++;
+        } else {    /* should be ASCII */
+            len--;
+            *p++ = c1;
+        }
+    }
+    *p = '\0';
+}
+
+/*
+ * MIC ---> EUC_JP
+ *
+ * Consumes MIC characters until a NUL lead byte is seen or len drops to
+ * zero or below (len is reduced by pg_mic_mblen() per character).
+ * LC_JISX0201K maps to SS2 + one byte, LC_JISX0212 to SS3 + two bytes,
+ * LC_JISX0208 to its two payload bytes; any other lead byte above 0x7f is
+ * rendered through printBogusChar(), and the rest is copied as is.
+ * The output is NUL-terminated and not bounds-checked.
+ */
+static void mic2euc_jp(unsigned char *mic, unsigned char *p, int len)
+{
+    int lead;
+
+    while (len > 0 && (lead = *mic) != 0) {
+        len -= pg_mic_mblen(mic);
+
+        switch (lead) {
+        case LC_JISX0201K:
+            *p++ = SS2;
+            *p++ = mic[1];
+            mic += 2;
+            break;
+
+        case LC_JISX0212:
+            *p++ = SS3;
+            *p++ = mic[1];
+            *p++ = mic[2];
+            mic += 3;
+            break;
+
+        case LC_JISX0208:
+            *p++ = mic[1];
+            *p++ = mic[2];
+            mic += 3;
+            break;
+
+        default:
+            if (lead > 0x7f) {
+                /* cannot convert to EUC_JP; mic still points at the lead byte */
+                printBogusChar(&mic, &p);
+            } else {
+                /* should be ASCII */
+                *p++ = lead;
+                mic++;
+            }
+            break;
+        }
+    }
+    *p = '\0';
+}
+
+/*
+ * EUC_KR ---> MIC
+ *
+ * euc: input bytes; conversion stops at a NUL byte or after len bytes.
+ * p:   output buffer, always NUL-terminated on return; not bounds-checked.
+ *      A two-byte input character becomes three output bytes.
+ * len: number of input bytes to convert.
+ *
+ * A byte with the high bit set starts a two-byte character that becomes
+ * LC_KS5601 + both bytes; everything else is copied through unchanged.
+ *
+ * A lead byte whose trail byte is missing (input exhausted or NUL) ends
+ * the conversion instead of reading past the end of the input.
+ */
+static void euc_kr2mic(unsigned char *euc, unsigned char *p, int len)
+{
+    int c1;
+
+    while (len > 0 && (c1 = *euc++)) {
+        if (c1 & 0x80) {
+            if (len < 2 || *euc == '\0')
+                break;      /* truncated */
+            len -= 2;
+            *p++ = LC_KS5601;
+            *p++ = c1;
+            *p++ = *euc++;
+        } else {    /* should be ASCII */
+            len--;
+            *p++ = c1;
+        }
+    }
+    *p = '\0';
+}
+
+/*
+ * MIC ---> EUC_KR
+ *
+ * Consumes MIC characters until a NUL lead byte is seen or len drops to
+ * zero or below (len is reduced by pg_mic_mblen() per character).
+ * LC_KS5601 characters yield their two payload bytes; any other lead byte
+ * above 0x7f is rendered through printBogusChar(), and the rest is copied
+ * as is.  The output is NUL-terminated and not bounds-checked.
+ */
+static void mic2euc_kr(unsigned char *mic, unsigned char *p, int len)
+{
+    int lead;
+
+    while (len > 0 && (lead = *mic) != 0) {
+        len -= pg_mic_mblen(mic);
+
+        switch (lead) {
+        case LC_KS5601:
+            *p++ = mic[1];
+            *p++ = mic[2];
+            mic += 3;
+            break;
+
+        default:
+            if (lead > 0x7f) {
+                /* cannot convert to EUC_KR; mic still points at the lead byte */
+                printBogusChar(&mic, &p);
+            } else {
+                /* should be ASCII */
+                *p++ = lead;
+                mic++;
+            }
+            break;
+        }
+    }
+    *p = '\0';
+}
+
+/*
+ * EUC_CN ---> MIC
+ *
+ * euc: input bytes; conversion stops at a NUL byte or after len bytes.
+ * p:   output buffer, always NUL-terminated on return; not bounds-checked.
+ *      A two-byte input character becomes three output bytes.
+ * len: number of input bytes to convert.
+ *
+ * A byte with the high bit set starts a two-byte character that becomes
+ * LC_GB2312_80 + both bytes; everything else is copied through unchanged.
+ *
+ * A lead byte whose trail byte is missing (input exhausted or NUL) ends
+ * the conversion instead of reading past the end of the input.
+ */
+static void euc_cn2mic(unsigned char *euc, unsigned char *p, int len)
+{
+    int c1;
+
+    while (len > 0 && (c1 = *euc++)) {
+        if (c1 & 0x80) {
+            if (len < 2 || *euc == '\0')
+                break;      /* truncated */
+            len -= 2;
+            *p++ = LC_GB2312_80;
+            *p++ = c1;
+            *p++ = *euc++;
+        } else {    /* should be ASCII */
+            len--;
+            *p++ = c1;
+        }
+    }
+    *p = '\0';
+}
+
+/*
+ * MIC ---> EUC_CN
+ *
+ * Consumes MIC characters until a NUL lead byte is seen or len drops to
+ * zero or below (len is reduced by pg_mic_mblen() per character).
+ * LC_GB2312_80 characters yield their two payload bytes; any other lead
+ * byte above 0x7f is rendered through printBogusChar(), and the rest is
+ * copied as is.  The output is NUL-terminated and not bounds-checked.
+ */
+static void mic2euc_cn(unsigned char *mic, unsigned char *p, int len)
+{
+    int lead;
+
+    while (len > 0 && (lead = *mic) != 0) {
+        len -= pg_mic_mblen(mic);
+
+        switch (lead) {
+        case LC_GB2312_80:
+            *p++ = mic[1];
+            *p++ = mic[2];
+            mic += 3;
+            break;
+
+        default:
+            if (lead > 0x7f) {
+                /* cannot convert to EUC_CN; mic still points at the lead byte */
+                printBogusChar(&mic, &p);
+            } else {
+                /* should be ASCII */
+                *p++ = lead;
+                mic++;
+            }
+            break;
+        }
+    }
+    *p = '\0';
+}
+
+/*
+ * EUC_TW ---> MIC
+ *
+ * euc: input bytes; conversion stops at a NUL byte or after len bytes.
+ * p:   output buffer, always NUL-terminated on return; not bounds-checked.
+ *      A two-byte input character becomes three output bytes.
+ * len: number of input bytes to convert.
+ *
+ * Input forms handled:
+ *   SS2 + plane + 2 bytes   plane 0xa1 -> LC_CNS11643_1 + 2 bytes
+ *                           plane 0xa2 -> LC_CNS11643_2 + 2 bytes
+ *                           otherwise  -> 0x9d (LCPRV2) + charset + 2 bytes
+ *   high-bit byte + 1 byte  -> LC_CNS11643_1 + both bytes
+ *   anything else           -> copied through unchanged
+ *
+ * For the "otherwise" case the charset byte is LC_CNS11643_3 + (plane -
+ * 0xa3), the exact inverse of what mic2euc_tw() computes.  The plane byte
+ * is not range-checked here.
+ * NOTE(review): this presumes the charset ids for plane 3 and above are
+ * consecutive starting at LC_CNS11643_3 -- confirm against mb/pg_wchar.h.
+ *
+ * A multibyte character whose trailing bytes are missing (input exhausted
+ * or NUL) ends the conversion instead of reading past the end of the input.
+ */
+static void euc_tw2mic(unsigned char *euc, unsigned char *p, int len)
+{
+    int c1;
+
+    while (len > 0 && (c1 = *euc++)) {
+        if (c1 == SS2) {
+            /* plane byte and both code bytes must be present */
+            if (len < 4 || euc[0] == '\0' || euc[1] == '\0' || euc[2] == '\0')
+                break;      /* truncated */
+            len -= 4;
+            c1 = *euc++;    /* plane No. */
+            if (c1 == 0xa1) {
+                *p++ = LC_CNS11643_1;
+            } else if (c1 == 0xa2) {
+                *p++ = LC_CNS11643_2;
+            } else {
+                *p++ = 0x9d;    /* LCPRV2 */
+                /* plane 0xa3 -> LC_CNS11643_3, 0xa4 -> the next id, ... */
+                *p++ = c1 - 0xa3 + LC_CNS11643_3;
+            }
+            *p++ = *euc++;
+            *p++ = *euc++;
+        } else if (c1 & 0x80) {    /* CNS11643-1 */
+            if (len < 2 || euc[0] == '\0')
+                break;      /* truncated */
+            len -= 2;
+            *p++ = LC_CNS11643_1;
+            *p++ = c1;
+            *p++ = *euc++;
+        } else {    /* should be ASCII */
+            len--;
+            *p++ = c1;
+        }
+    }
+    *p = '\0';
+}
+
+/*
+ * MIC ---> EUC_TW
+ *
+ * mic: MIC input; conversion stops at a NUL lead byte or once len drops
+ *      to zero or below (len is reduced by pg_mic_mblen() per character).
+ * p:   output buffer, always NUL-terminated on return; not bounds-checked.
+ * len: number of input bytes to convert.
+ *
+ * Output forms, mirroring euc_tw2mic():
+ *   LC_CNS11643_1 + 2 bytes            -> the 2 bytes
+ *   LC_CNS11643_2 + 2 bytes            -> SS2 + 0xa2 + the 2 bytes
+ *   0x9d (LCPRV2) + charset + 2 bytes  -> SS2 + plane + the 2 bytes,
+ *                                         plane = charset - LC_CNS11643_3 + 0xa3
+ *   other lead byte above 0x7f         -> printBogusChar()
+ *   anything else                      -> copied through unchanged
+ *
+ * Plane 2 must carry its SS2 0xa2 prefix; without it the two code bytes
+ * would be read back as a plane 1 character.  For LCPRV2 the plane is
+ * derived from the charset byte that follows 0x9d, and that byte is
+ * consumed so the input cursor stays aligned with pg_mic_mblen().
+ *
+ * NOTE(review): the accepted charset range LC_CNS11643_3 .. +4 assumes
+ * planes 3..7 have consecutive ids -- confirm against mb/pg_wchar.h.
+ * LCPRV2 characters outside that range are reported as bogus.
+ */
+static void mic2euc_tw(unsigned char *mic, unsigned char *p, int len)
+{
+    int c1;
+
+    while (len > 0 && (c1 = *mic)) {
+        len -= pg_mic_mblen(mic);
+
+        if (c1 == LC_CNS11643_1) {
+            *p++ = mic[1];
+            *p++ = mic[2];
+            mic += 3;
+        } else if (c1 == LC_CNS11643_2) {
+            *p++ = SS2;
+            *p++ = 0xa2;    /* plane No. */
+            *p++ = mic[1];
+            *p++ = mic[2];
+            mic += 3;
+        } else if (c1 == 0x9d &&    /* LCPRV2? */
+                   mic[1] >= LC_CNS11643_3 &&
+                   mic[1] <= LC_CNS11643_3 + 4) {
+            *p++ = SS2;
+            *p++ = mic[1] - LC_CNS11643_3 + 0xa3;    /* plane No. */
+            *p++ = mic[2];
+            *p++ = mic[3];
+            mic += 4;
+        } else if (c1 > 0x7f) {    /* cannot convert to EUC_TW! */
+            /* mic still points at the lead byte, as printBogusChar() expects */
+            printBogusChar(&mic, &p);
+        } else {    /* should be ASCII */
+            *p++ = c1;
+            mic++;
+        }
+    }
+    *p = '\0';
+}
+
+/*
+ * LATINn ---> MIC
+ *
+ * Copies at most len bytes from l to p, stopping early at a NUL byte.
+ * Every byte with the high bit set is preceded in the output by the
+ * leading byte lc; other bytes are copied unchanged.  The output is
+ * NUL-terminated and not bounds-checked (at most 2 * len + 1 bytes).
+ */
+static void latin2mic(unsigned char *l, unsigned char *p, int len, int lc)
+{
+    int i;
+
+    for (i = 0; i < len; i++) {
+        unsigned char ch = l[i];
+
+        if (ch == '\0')
+            break;
+        if (ch & 0x80)
+            *p++ = lc;      /* tag non-ASCII bytes with the charset id */
+        *p++ = ch;
+    }
+    *p = '\0';
+}
+
+/*
+ * MIC ---> LATINn
+ *
+ * Consumes MIC characters until a NUL lead byte is seen or len drops to
+ * zero or below (len is reduced by pg_mic_mblen() per character).
+ * A character whose lead byte equals lc yields its single payload byte;
+ * any other lead byte above 0x7f is rendered through printBogusChar(),
+ * and the rest is copied as is.  The output is NUL-terminated and not
+ * bounds-checked.
+ */
+static void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc)
+{
+    int lead;
+
+    for (;;) {
+        if (len <= 0)
+            break;
+        lead = *mic;
+        if (lead == 0)
+            break;
+
+        len -= pg_mic_mblen(mic);
+
+        if (lead == lc) {
+            *p++ = mic[1];
+            mic += 2;
+            continue;
+        }
+        if (lead > 0x7f) {
+            /* not representable; mic still points at the lead byte */
+            printBogusChar(&mic, &p);
+            continue;
+        }
+        /* should be ASCII */
+        *p++ = lead;
+        mic++;
+    }
+    *p = '\0';
+}
+
+/* LATIN1 ---> MIC: latin2mic() with the leading byte fixed to LC_ISO8859_1 */
+static void latin12mic(unsigned char *l, unsigned char *p, int len)
+{
+    latin2mic(l, p, len, LC_ISO8859_1);
+}
+/* MIC ---> LATIN1: mic2latin() with the leading byte fixed to LC_ISO8859_1 */
+static void mic2latin1(unsigned char *mic, unsigned char *p, int len)
+{
+    mic2latin(mic, p, len, LC_ISO8859_1);
+}
+/* LATIN2 ---> MIC: latin2mic() with the leading byte fixed to LC_ISO8859_2 */
+static void latin22mic(unsigned char *l, unsigned char *p, int len)
+{
+    latin2mic(l, p, len, LC_ISO8859_2);
+}
+/* MIC ---> LATIN2: mic2latin() with the leading byte fixed to LC_ISO8859_2 */
+static void mic2latin2(unsigned char *mic, unsigned char *p, int len)
+{
+    mic2latin(mic, p, len, LC_ISO8859_2);
+}
+/* LATIN3 ---> MIC: latin2mic() with the leading byte fixed to LC_ISO8859_3 */
+static void latin32mic(unsigned char *l, unsigned char *p, int len)
+{
+    latin2mic(l, p, len, LC_ISO8859_3);
+}
+/* MIC ---> LATIN3: mic2latin() with the leading byte fixed to LC_ISO8859_3 */
+static void mic2latin3(unsigned char *mic, unsigned char *p, int len)
+{
+    mic2latin(mic, p, len, LC_ISO8859_3);
+}
+/* LATIN4 ---> MIC: latin2mic() with the leading byte fixed to LC_ISO8859_4 */
+static void latin42mic(unsigned char *l, unsigned char *p, int len)
+{
+    latin2mic(l, p, len, LC_ISO8859_4);
+}
+/* MIC ---> LATIN4: mic2latin() with the leading byte fixed to LC_ISO8859_4 */
+static void mic2latin4(unsigned char *mic, unsigned char *p, int len)
+{
+    mic2latin(mic, p, len, LC_ISO8859_4);
+}
+/* LATIN5 ---> MIC: latin2mic() with the leading byte fixed to LC_ISO8859_5 */
+static void latin52mic(unsigned char *l, unsigned char *p, int len)
+{
+    latin2mic(l, p, len, LC_ISO8859_5);
+}
+/* MIC ---> LATIN5: mic2latin() with the leading byte fixed to LC_ISO8859_5 */
+static void mic2latin5(unsigned char *mic, unsigned char *p, int len)
+{
+    mic2latin(mic, p, len, LC_ISO8859_5);
+}
+
+/*
+ * Table of supported encodings and their converters.
+ *
+ * Each row holds: encoding id, encoding name, a flag, the converter from
+ * that encoding to MIC, and the converter from MIC back to it.  Rows with
+ * no converters (UNICODE, MULE_INTERNAL) carry NULL in both slots.  The
+ * table ends with a row whose id is -1.
+ *
+ * NOTE(review): the flag is 1 only for SJIS; its meaning is defined by
+ * pg_encoding_conv_tbl in mb/pg_wchar.h -- presumably it marks an encoding
+ * usable on the client side only; confirm there.
+ */
+pg_encoding_conv_tbl pg_conv_tbl[] = {
+    {EUC_JP,        "EUC_JP",        0, euc_jp2mic, mic2euc_jp},    /* EUC_JP */
+    {EUC_CN,        "EUC_CN",        0, euc_cn2mic, mic2euc_cn},    /* EUC_CN */
+    {EUC_KR,        "EUC_KR",        0, euc_kr2mic, mic2euc_kr},    /* EUC_KR */
+    {EUC_TW,        "EUC_TW",        0, euc_tw2mic, mic2euc_tw},    /* EUC_TW */
+    {UNICODE,       "UNICODE",       0, NULL,       NULL},          /* UNICODE */
+    {MULE_INTERNAL, "MULE_INTERNAL", 0, NULL,       NULL},          /* MULE_INTERNAL */
+    {LATIN1,        "LATIN1",        0, latin12mic, mic2latin1},    /* ISO 8859 Latin 1 */
+    {LATIN2,        "LATIN2",        0, latin22mic, mic2latin2},    /* ISO 8859 Latin 2 */
+    {LATIN3,        "LATIN3",        0, latin32mic, mic2latin3},    /* ISO 8859 Latin 3 */
+    {LATIN4,        "LATIN4",        0, latin42mic, mic2latin4},    /* ISO 8859 Latin 4 */
+    {LATIN5,        "LATIN5",        0, latin52mic, mic2latin5},    /* ISO 8859 Latin 5 */
+    {SJIS,          "SJIS",          1, sjis2mic,   mic2sjis},      /* SJIS */
+    {-1,            "",              0, NULL,       NULL}           /* end mark */
+};