aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/utils/mb/Unicode/Makefile22
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_BIG5.pl184
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl154
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl272
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl398
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl115
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl142
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_GB18030.pl80
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_JOHAB.pl31
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl205
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_SJIS.pl157
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_UHC.pl51
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_most.pl116
-rw-r--r--src/backend/utils/mb/Unicode/convutils.pm282
-rw-r--r--src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map9
-rw-r--r--src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map7
-rw-r--r--src/backend/utils/mb/Unicode/euc_jp_to_utf8.map4
-rw-r--r--src/backend/utils/mb/Unicode/euc_kr_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/johab_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map9
-rw-r--r--src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map7
-rw-r--r--src/backend/utils/mb/Unicode/ucs2utf.pl35
-rw-r--r--src/backend/utils/mb/Unicode/uhc_to_utf8.map2
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_euc_cn.map2
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map9
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map7
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_euc_jp.map2
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_euc_kr.map2
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_johab.map2
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map9
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map7
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_sjis.map4
-rw-r--r--src/backend/utils/mb/Unicode/utf8_to_uhc.map2
33 files changed, 791 insertions, 1541 deletions
diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile
index 9d2ef5e3d22..ea21f4a8527 100644
--- a/src/backend/utils/mb/Unicode/Makefile
+++ b/src/backend/utils/mb/Unicode/Makefile
@@ -39,8 +39,6 @@ WINMAPS = win866_to_utf8.map utf8_to_win866.map \
win1258_to_utf8.map utf8_to_win1258.map
GENERICMAPS = $(ISO8859MAPS) $(WINMAPS) \
- johab_to_utf8.map utf8_to_johab.map \
- uhc_to_utf8.map utf8_to_uhc.map \
gbk_to_utf8.map utf8_to_gbk.map \
koi8r_to_utf8.map utf8_to_koi8r.map
@@ -51,6 +49,8 @@ SPECIALMAPS = euc_cn_to_utf8.map utf8_to_euc_cn.map \
sjis_to_utf8.map utf8_to_sjis.map \
gb18030_to_utf8.map utf8_to_gb18030.map \
big5_to_utf8.map utf8_to_big5.map \
+ johab_to_utf8.map utf8_to_johab.map \
+ uhc_to_utf8.map utf8_to_uhc.map \
euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_combined.map \
utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_combined.map \
shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_combined.map \
@@ -63,23 +63,29 @@ ISO8859TEXTS = 8859-2.TXT 8859-3.TXT 8859-4.TXT 8859-5.TXT \
8859-10.TXT 8859-13.TXT 8859-14.TXT 8859-15.TXT \
8859-16.TXT
-WINTEXTS = CP866.TXT CP874.TXT CP936.TXT CP949.TXT \
+WINTEXTS = CP866.TXT CP874.TXT CP936.TXT \
CP1250.TXT CP1251.TXT \
CP1252.TXT CP1253.TXT CP1254.TXT CP1255.TXT \
CP1256.TXT CP1257.TXT CP1258.TXT
GENERICTEXTS = $(ISO8859TEXTS) $(WINTEXTS) \
- KOI8-R.TXT KOI8-U.TXT JOHAB.TXT
+ KOI8-R.TXT KOI8-U.TXT
all: $(MAPS)
$(GENERICMAPS): UCS_to_most.pl $(GENERICTEXTS)
$(PERL) $<
-euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl JIS0201.TXT JIS0208.TXT JIS0212.TXT
+johab_to_utf8.map utf8_to_johab.map: UCS_to_JOHAB.pl JOHAB.TXT
+ $(PERL) $<
+
+uhc_to_utf8.map utf8_to_uhc.map: UCS_to_UHC.pl windows-949-2000.xml
+ $(PERL) $<
+
+euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl CP932.TXT JIS0212.TXT
$(PERL) $<
-euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl GB2312.TXT
+euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl gb-18030-2000.xml
$(PERL) $<
euc_kr_to_utf8.map utf8_to_euc_kr.map: UCS_to_EUC_KR.pl KSX1001.TXT
@@ -119,7 +125,7 @@ BIG5.TXT CNS11643.TXT:
euc-jis-2004-std.txt sjis-0213-2004-std.txt:
$(DOWNLOAD) http://x0213.org/codetable/$(@F)
-gb-18030-2000.xml:
+gb-18030-2000.xml windows-949-2000.xml:
$(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F)
GB2312.TXT:
@@ -137,7 +143,7 @@ KOI8-R.TXT KOI8-U.TXT:
$(ISO8859TEXTS):
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
-$(filter-out CP8%,$(WINTEXTS)):
+$(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT:
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
$(filter CP8%,$(WINTEXTS)):
diff --git a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
index 127fd157b07..6a1321bab84 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
@@ -25,56 +25,17 @@
# # and Unicode name (not used in this script)
-require "ucs2utf.pl";
+require "convutils.pm";
+# Load BIG5.TXT
+my $all = &read_source("BIG5.TXT");
-#
-# first, generate UTF8 --> BIG5 table
-#
-$in_file = "BIG5.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-reset 'array';
+# Load CP950.TXT
+my $cp950txt = &read_source("CP950.TXT");
-while (<FILE>)
-{
- chop;
- if (/^#/)
- {
- next;
- }
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
- $array{$utf} = $code;
- }
-}
-close(FILE);
-
-$in_file = "CP950.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
- chop;
- if (/^#/)
- {
- next;
- }
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
+foreach my $i (@$cp950txt) {
+ my $code = $i->{code};
+ my $ucs = $i->{ucs};
# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
# from CP950.TXT
@@ -83,126 +44,25 @@ while (<FILE>)
&& $code >= 0xf9d6
&& $code <= 0xf9dc)
{
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
- $array{$utf} = $code;
+ push @$all, {code => $code,
+ ucs => $ucs,
+ comment => $i->{comment},
+ direction => "both"};
}
}
-close(FILE);
-
-$file = lc("utf8_to_big5.map");
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapBIG5[ $count ] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
- $code = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
- }
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
- }
-}
-
-print FILE "};\n";
-close(FILE);
-
-#
-# then generate BIG5 --> UTF8 table
-#
-$in_file = "BIG5.TXT";
-open(FILE, $in_file) || die("cannot open $in_file");
+foreach my $i (@$all) {
+ my $code = $i->{code};
+ my $ucs = $i->{ucs};
-reset 'array';
-
-while (<FILE>)
-{
- chop;
- if (/^#/)
- {
- next;
- }
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
- $array{$code} = $utf;
- }
-}
-close(FILE);
-
-$in_file = "CP950.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
- chop;
- if (/^#/)
- {
- next;
- }
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
-
- # Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
- # from CP950.TXT
- if ( $code >= 0x80
- && $ucs >= 0x0080
- && $code >= 0xf9d6
- && $code <= 0xf9dc)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
- $array{$code} = $utf;
- }
-}
-close(FILE);
-
-$file = lc("big5_to_utf8.map");
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapBIG5[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
- $utf = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
- }
- else
+ # BIG5.TXT maps several BIG5 characters to U+FFFD. The UTF-8 to BIG5 mapping can
+ # contain only one of them. XXX: Doesn't really make sense to include any of them,
+ # but for historical reasons, we map the first one of them.
+ if ($i->{ucs} == 0xFFFD && $i->{code} != 0xA15A)
{
- printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
+ $i->{direction} = "to_unicode";
}
}
-print FILE "};\n";
-close(FILE);
+# Output
+print_tables("BIG5", $all);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
index 53f44773c93..8df23f8be65 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
@@ -1,128 +1,76 @@
#! /usr/bin/perl
#
-# Copyright (c) 2001-2016, PostgreSQL Global Development Group
+# Copyright (c) 2007-2016, PostgreSQL Global Development Group
#
-# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
+# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
#
-# Generate UTF-8 <--> EUC_CN code conversion tables from
-# map files provided by Unicode organization.
-# Unfortunately it is prohibited by the organization
-# to distribute the map files. So if you try to use this script,
-# you have to obtain GB2312.TXT from
-# the organization's ftp site.
+# Generate UTF-8 <--> EUC_CN code conversion tables from
+# "gb-18030-2000.xml", obtained from
+# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
#
-# GB2312.TXT format:
-# GB2312 code in hex
-# UCS-2 code in hex
-# # and Unicode name (not used in this script)
+# The lines we care about in the source file look like
+# <a u="009A" b="81 30 83 36"/>
+# where the "u" field is the Unicode code point in hex,
+# and the "b" field is the hex byte sequence for GB18030
-require "ucs2utf.pl";
+require "convutils.pm";
-# first generate UTF-8 --> EUC_CN table
+# Read the input
-$in_file = "GB2312.TXT";
+$in_file = "gb-18030-2000.xml";
open(FILE, $in_file) || die("cannot open $in_file");
+my @mapping;
+
while (<FILE>)
{
- chop;
- if (/^#/)
- {
- next;
- }
- ($c, $u, $rest) = split;
+ next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
+ $u = $1;
+ $c = $2;
+ $c =~ s/ //g;
$ucs = hex($u);
$code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
-
- $array{$utf} = ($code | 0x8080);
- }
-}
-close(FILE);
-
-$file = "utf8_to_euc_cn.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapEUC_CN[ $count ] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
- $code = $array{$index};
- $count--;
- if ($count == 0)
+ # The GB-18030 character set, which we use as the source, contains
+ # a lot of extra characters on top of the GB2312 character set that
+ # EUC_CN encodes. Filter out those extra characters.
+ next if (($code & 0xFF) < 0xA1);
+ next if (!($code >= 0xA100 && $code <= 0xA9FF ||
+ $code >= 0xB000 && $code <= 0xF7FF));
+
+ next if ($code >= 0xA2A1 && $code <= 0xA2B0);
+ next if ($code >= 0xA2E3 && $code <= 0xA2E4);
+ next if ($code >= 0xA2EF && $code <= 0xA2F0);
+ next if ($code >= 0xA2FD && $code <= 0xA2FE);
+ next if ($code >= 0xA4F4 && $code <= 0xA4FE);
+ next if ($code >= 0xA5F7 && $code <= 0xA5FE);
+ next if ($code >= 0xA6B9 && $code <= 0xA6C0);
+ next if ($code >= 0xA6D9 && $code <= 0xA6FE);
+ next if ($code >= 0xA7C2 && $code <= 0xA7D0);
+ next if ($code >= 0xA7F2 && $code <= 0xA7FE);
+ next if ($code >= 0xA8BB && $code <= 0xA8C4);
+ next if ($code >= 0xA8EA && $code <= 0xA8FE);
+ next if ($code >= 0xA9A1 && $code <= 0xA9A3);
+ next if ($code >= 0xA9F0 && $code <= 0xA9FE);
+ next if ($code >= 0xD7FA && $code <= 0xD7FE);
+
+ # A couple of characters are mapped differently from GB-2312 or GB-18030
+ if ($code == 0xA1A4)
{
- printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
+ $ucs = 0x30FB;
}
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
- }
-}
-
-print FILE "};\n";
-close(FILE);
-
-#
-# then generate EUC_CN --> UTF8 table
-#
-reset 'array';
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
- chop;
- if (/^#/)
+ if ($code == 0xA1AA)
{
- next;
+ $ucs = 0x2015;
}
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$code} ne "")
- {
- printf STDERR "Warning: duplicate code: %04x\n", $ucs;
- next;
- }
- $count++;
- $code |= 0x8080;
- $array{$code} = $utf;
+ push @mapping, {
+ ucs => $ucs,
+ code => $code,
+ direction => 'both'
}
}
close(FILE);
-$file = "euc_cn_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapEUC_CN[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
- $utf = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
- }
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
- }
-}
-
-print FILE "};\n";
-close(FILE);
+print_tables("EUC_CN", \@mapping);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl
index d2f1b757cb3..b4e140b657c 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl
@@ -7,9 +7,7 @@
# Generate UTF-8 <--> EUC_JIS_2004 code conversion tables from
# "euc-jis-2004-std.txt" (http://x0213.org)
-require "ucs2utf.pl";
-
-$TEST = 0;
+require "convutils.pm";
# first generate UTF-8 --> EUC_JIS_2004 table
@@ -17,10 +15,7 @@ $in_file = "euc-jis-2004-std.txt";
open(FILE, $in_file) || die("cannot open $in_file");
-reset 'array';
-reset 'array1';
-reset 'comment';
-reset 'comment1';
+my @all;
while ($line = <FILE>)
{
@@ -31,14 +26,14 @@ while ($line = <FILE>)
$u2 = $3;
$rest = "U+" . $u1 . "+" . $u2 . $4;
$code = hex($c);
- $ucs = hex($u1);
- $utf1 = &ucs2utf($ucs);
- $ucs = hex($u2);
- $utf2 = &ucs2utf($ucs);
- $str = sprintf "%08x%08x", $utf1, $utf2;
- $array1{$str} = $code;
- $comment1{$str} = $rest;
- $count1++;
+ $ucs1 = hex($u1);
+ $ucs2 = hex($u2);
+
+ push @all, { direction => 'both',
+ ucs => $ucs1,
+ ucs_second => $ucs2,
+ code => $code,
+ comment => $rest };
next;
}
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
@@ -54,252 +49,11 @@ while ($line = <FILE>)
$ucs = hex($u);
$code = hex($c);
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
-
- $array{$utf} = $code;
- $comment{$code} = $rest;
-}
-close(FILE);
-
-$file = "utf8_to_euc_jis_2004.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE "static const pg_utf_to_local ULmapEUC_JIS_2004[] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
- $code = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%08x, 0x%06x} /* %s */\n", $index, $code,
- $comment{$code};
- }
- else
- {
- printf FILE " {0x%08x, 0x%06x}, /* %s */\n", $index, $code,
- $comment{$code};
- }
-}
-
-print FILE "};\n";
-close(FILE);
-
-if ($TEST == 1)
-{
- $file1 = "utf8.data";
- $file2 = "euc_jis_2004.data";
- open(FILE1, "> $file1") || die("cannot open $file1");
- open(FILE2, "> $file2") || die("cannot open $file2");
-
- for $index (sort { $a <=> $b } keys(%array))
- {
- $code = $array{$index};
- if ( $code > 0x00
- && $code != 0x09
- && $code != 0x0a
- && $code != 0x0d
- && $code != 0x5c
- && ( $code < 0x80
- || ($code >= 0x8ea1 && $code <= 0x8efe)
- || ($code >= 0x8fa1a1 && $code <= 0x8ffefe)
- || ($code >= 0xa1a1 && $code <= 0x8fefe)))
- {
- for ($i = 3; $i >= 0; $i--)
- {
- $s = $i * 8;
- $mask = 0xff << $s;
- print FILE1 pack("C", ($index & $mask) >> $s)
- if $index & $mask;
- print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask;
- }
- print FILE1 "\n";
- print FILE2 "\n";
- }
- }
-}
-$file = "utf8_to_euc_jis_2004_combined.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE
- "static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = {\n";
+ next if ($code < 0x80 && $ucs < 0x80);
-for $index (sort { $a cmp $b } keys(%array1))
-{
- $code = $array1{$index};
- $count1--;
- if ($count1 == 0)
- {
- printf FILE " {0x%s, 0x%s, 0x%06x} /* %s */\n", substr($index, 0, 8),
- substr($index, 8, 8), $code, $comment1{$index};
- }
- else
- {
- printf FILE " {0x%s, 0x%s, 0x%06x}, /* %s */\n",
- substr($index, 0, 8), substr($index, 8, 8), $code,
- $comment1{$index};
- }
+ push @all, { direction => 'both', ucs => $ucs, code => $code, comment => $rest };
}
-
-print FILE "};\n";
close(FILE);
-if ($TEST == 1)
-{
- for $index (sort { $a cmp $b } keys(%array1))
- {
- $code = $array1{$index};
- if ( $code > 0x00
- && $code != 0x09
- && $code != 0x0a
- && $code != 0x0d
- && $code != 0x5c
- && ( $code < 0x80
- || ($code >= 0x8ea1 && $code <= 0x8efe)
- || ($code >= 0x8fa1a1 && $code <= 0x8ffefe)
- || ($code >= 0xa1a1 && $code <= 0x8fefe)))
- {
-
- $v1 = hex(substr($index, 0, 8));
- $v2 = hex(substr($index, 8, 8));
-
- for ($i = 3; $i >= 0; $i--)
- {
- $s = $i * 8;
- $mask = 0xff << $s;
- print FILE1 pack("C", ($v1 & $mask) >> $s) if $v1 & $mask;
- print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask;
- }
- for ($i = 3; $i >= 0; $i--)
- {
- $s = $i * 8;
- $mask = 0xff << $s;
- print FILE1 pack("C", ($v2 & $mask) >> $s) if $v2 & $mask;
- }
- print FILE1 "\n";
- print FILE2 "\n";
- }
- }
- close(FILE1);
- close(FILE2);
-}
-
-# then generate EUC_JIS_2004 --> UTF-8 table
-
-$in_file = "euc-jis-2004-std.txt";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-reset 'array';
-reset 'array1';
-reset 'comment';
-reset 'comment1';
-
-while ($line = <FILE>)
-{
- if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
- {
- $c = $1;
- $u1 = $2;
- $u2 = $3;
- $rest = "U+" . $u1 . "+" . $u2 . $4;
- $code = hex($c);
- $ucs = hex($u1);
- $utf1 = &ucs2utf($ucs);
- $ucs = hex($u2);
- $utf2 = &ucs2utf($ucs);
- $str = sprintf "%08x%08x", $utf1, $utf2;
- $array1{$code} = $str;
- $comment1{$code} = $rest;
- $count1++;
- next;
- }
- elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
- {
- $c = $1;
- $u = $2;
- $rest = "U+" . $u . $3;
- }
- else
- {
- next;
- }
-
- $ucs = hex($u);
- $code = hex($c);
- $utf = &ucs2utf($ucs);
- if ($array{$code} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
-
- $array{$code} = $utf;
- $comment{$utf} = $rest;
-}
-close(FILE);
-
-$file = "euc_jis_2004_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE "static const pg_local_to_utf LUmapEUC_JIS_2004[] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
- $code = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%06x, 0x%08x} /* %s */\n", $index, $code,
- $comment{$code};
- }
- else
- {
- printf FILE " {0x%06x, 0x%08x}, /* %s */\n", $index, $code,
- $comment{$code};
- }
-}
-
-print FILE "};\n";
-close(FILE);
-
-$file = "euc_jis_2004_to_utf8_combined.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE
- "static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array1))
-{
- $code = $array1{$index};
- $count1--;
- if ($count1 == 0)
- {
- printf FILE " {0x%06x, 0x%s, 0x%s} /* %s */\n", $index,
- substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
- }
- else
- {
- printf FILE " {0x%06x, 0x%s, 0x%s}, /* %s */\n", $index,
- substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
- }
-}
-
-print FILE "};\n";
-close(FILE);
+print_tables("EUC_JIS_2004", \@all, 1);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
index 055fc849bae..0e9dd292bff 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
@@ -8,275 +8,223 @@
# map files provided by Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
-# you have to obtain JIS0201.TXT, JIS0208.TXT, JIS0212.TXT from
-# the organization's ftp site.
-#
-# JIS0201.TXT format:
-# JIS0201 code in hex
-# UCS-2 code in hex
-# # and Unicode name (not used in this script)
-#
-# JIS0208.TXT format:
-# JIS0208 shift-JIS code in hex
-# JIS0208 code in hex
-# UCS-2 code in hex
-# # and Unicode name (not used in this script)
-#
-# JIS0212.TXT format:
-# JIS0212 code in hex
-# UCS-2 code in hex
-# # and Unicode name (not used in this script)
-
-require "ucs2utf.pl";
-
-# first generate UTF-8 --> EUC_JP table
+# you have to obtain CP932.TXT and JIS0212.TXT from the
+# organization's ftp site.
-#
-# JIS0201
-#
-$in_file = "JIS0201.TXT";
+use strict;
+require "convutils.pm";
-open(FILE, $in_file) || die("cannot open $in_file");
+# Load JIS0212.TXT
+my $jis0212 = &read_source("JIS0212.TXT");
-reset 'array';
+my @mapping;
-while (<FILE>)
-{
- chop;
- if (/^#/)
+foreach my $i (@$jis0212) {
+ # We have a different mapping for this in the EUC_JP to UTF-8 direction.
+ if ($i->{code} == 0x2243)
{
- next;
+ $i->{direction} = "from_unicode";
}
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
- # add single shift 2
- $array{$utf} = ($code | 0x8e00);
+ if ($i->{code} == 0x2271)
+ {
+ $i->{direction} = "to_unicode";
}
-}
-close(FILE);
-
-#
-# JIS0208
-#
-$in_file = "JIS0208.TXT";
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
- chop;
- if (/^#/)
+ if ($i->{ucs} >= 0x080)
{
- next;
+ $i->{code} = $i->{code} | 0x8f8080;
}
- ($s, $c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
+ else
{
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
-
- $array{$utf} = ($code | 0x8080);
+ next;
}
+
+ push @mapping, $i;
}
-close(FILE);
-#
-# JIS0212
-#
-$in_file = "JIS0212.TXT";
+# Load CP932.TXT.
+my $ct932 = &read_source("CP932.TXT");
-open(FILE, $in_file) || die("cannot open $in_file");
+foreach my $i (@$ct932) {
+ my $sjis = $i->{code};
-while (<FILE>)
-{
- chop;
- if (/^#/)
+ # We have a different mapping for this in the EUC_JP to UTF-8 direction.
+ if ($sjis == 0xeefa ||
+ $sjis == 0xeefb ||
+ $sjis == 0xeefc)
{
next;
}
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
- $array{$utf} = ($code | 0x8f8080);
- }
-}
-close(FILE);
+ if ($sjis >= 0xa1)
+ {
+ my $jis = &sjis2jis($sjis);
-$file = "utf8_to_euc_jp.map";
-open(FILE, "> $file") || die("cannot open $file");
+ $i->{code} = $jis | ($jis < 0x100 ? 0x8e00 :
+ ($sjis >= 0xeffd ? 0x8f8080 : 0x8080));
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapEUC_JP[ $count ] = {\n";
+ # Remember the SJIS code for later.
+ $i->{sjis} = $sjis;
-for $index (sort { $a <=> $b } keys(%array))
-{
- $code = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
- }
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
+ push @mapping, $i;
}
}
-print FILE "};\n";
-close(FILE);
-
-#
-# then generate EUC_JP --> UTF8 table
-#
+foreach my $i (@mapping) {
+ my $sjis = $i->{sjis};
-#
-# JIS0201
-#
-$in_file = "JIS0201.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-reset 'array';
-
-while (<FILE>)
-{
- chop;
- if (/^#/)
+ # These SJIS characters are excluded completely.
+ if ($sjis >= 0xed00 && $sjis <= 0xeef9 ||
+ $sjis >= 0xfa54 && $sjis <= 0xfa56 ||
+ $sjis >= 0xfa58 && $sjis <= 0xfc4b)
{
+ $i->{direction} = "none";
next;
}
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$code} ne "")
- {
- printf STDERR "Warning: duplicate code: %04x\n", $ucs;
- next;
- }
- $count++;
-
- # add single shift 2
- $code |= 0x8e00;
- $array{$code} = $utf;
- }
-}
-close(FILE);
-
-#
-# JIS0208
-#
-$in_file = "JIS0208.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-while (<FILE>)
-{
- chop;
- if (/^#/)
+ # These SJIS characters are only in the UTF-8 to EUC_JP table
+ if ($sjis == 0xeefa || $sjis == 0xeefb || $sjis == 0xeefc)
{
+ $i->{direction} = "from_unicode";
next;
}
- ($s, $c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$code} ne "")
- {
- printf STDERR "Warning: duplicate code: %04x\n", $ucs;
- next;
- }
- $count++;
- $code |= 0x8080;
- $array{$code} = $utf;
+ if ($sjis == 0x8790 || $sjis == 0x8791 || $sjis == 0x8792 ||
+ $sjis == 0x8795 || $sjis == 0x8796 || $sjis == 0x8797 ||
+ $sjis == 0x879a || $sjis == 0x879b || $sjis == 0x879c ||
+ ($sjis >= 0xfa4a && $sjis <= 0xfa53))
+ {
+ $i->{direction} = "to_unicode";
+ next;
}
}
-close(FILE);
-#
-# JIS0212
-#
-$in_file = "JIS0212.TXT";
+push @mapping, (
+ {direction => 'both', ucs => 0x4efc, code => 0x8ff4af, comment => '# CJK(4EFC)'},
+ {direction => 'both', ucs => 0x50f4, code => 0x8ff4b0, comment => '# CJK(50F4)'},
+ {direction => 'both', ucs => 0x51EC, code => 0x8ff4b1, comment => '# CJK(51EC)'},
+ {direction => 'both', ucs => 0x5307, code => 0x8ff4b2, comment => '# CJK(5307)'},
+ {direction => 'both', ucs => 0x5324, code => 0x8ff4b3, comment => '# CJK(5324)'},
+ {direction => 'both', ucs => 0x548A, code => 0x8ff4b5, comment => '# CJK(548A)'},
+ {direction => 'both', ucs => 0x5759, code => 0x8ff4b6, comment => '# CJK(5759)'},
+ {direction => 'both', ucs => 0x589E, code => 0x8ff4b9, comment => '# CJK(589E)'},
+ {direction => 'both', ucs => 0x5BEC, code => 0x8ff4ba, comment => '# CJK(5BEC)'},
+ {direction => 'both', ucs => 0x5CF5, code => 0x8ff4bb, comment => '# CJK(5CF5)'},
+ {direction => 'both', ucs => 0x5D53, code => 0x8ff4bc, comment => '# CJK(5D53)'},
+ {direction => 'both', ucs => 0x5FB7, code => 0x8ff4be, comment => '# CJK(5FB7)'},
+ {direction => 'both', ucs => 0x6085, code => 0x8ff4bf, comment => '# CJK(6085)'},
+ {direction => 'both', ucs => 0x6120, code => 0x8ff4c0, comment => '# CJK(6120)'},
+ {direction => 'both', ucs => 0x654E, code => 0x8ff4c1, comment => '# CJK(654E)'},
+ {direction => 'both', ucs => 0x663B, code => 0x8ff4c2, comment => '# CJK(663B)'},
+ {direction => 'both', ucs => 0x6665, code => 0x8ff4c3, comment => '# CJK(6665)'},
+ {direction => 'both', ucs => 0x6801, code => 0x8ff4c6, comment => '# CJK(6801)'},
+ {direction => 'both', ucs => 0x6A6B, code => 0x8ff4c9, comment => '# CJK(6A6B)'},
+ {direction => 'both', ucs => 0x6AE2, code => 0x8ff4ca, comment => '# CJK(6AE2)'},
+ {direction => 'both', ucs => 0x6DF2, code => 0x8ff4cc, comment => '# CJK(6DF2)'},
+ {direction => 'both', ucs => 0x6DF8, code => 0x8ff4cb, comment => '# CJK(6DF8)'},
+ {direction => 'both', ucs => 0x7028, code => 0x8ff4cd, comment => '# CJK(7028)'},
+ {direction => 'both', ucs => 0x70BB, code => 0x8ff4ae, comment => '# CJK(70BB)'},
+ {direction => 'both', ucs => 0x7501, code => 0x8ff4d0, comment => '# CJK(7501)'},
+ {direction => 'both', ucs => 0x7682, code => 0x8ff4d1, comment => '# CJK(7682)'},
+ {direction => 'both', ucs => 0x769E, code => 0x8ff4d2, comment => '# CJK(769E)'},
+ {direction => 'both', ucs => 0x7930, code => 0x8ff4d4, comment => '# CJK(7930)'},
+ {direction => 'both', ucs => 0x7AE7, code => 0x8ff4d9, comment => '# CJK(7AE7)'},
+ {direction => 'both', ucs => 0x7DA0, code => 0x8ff4dc, comment => '# CJK(7DA0)'},
+ {direction => 'both', ucs => 0x7DD6, code => 0x8ff4dd, comment => '# CJK(7DD6)'},
+ {direction => 'both', ucs => 0x8362, code => 0x8ff4df, comment => '# CJK(8362)'},
+ {direction => 'both', ucs => 0x85B0, code => 0x8ff4e1, comment => '# CJK(85B0)'},
+ {direction => 'both', ucs => 0x8807, code => 0x8ff4e4, comment => '# CJK(8807)'},
+ {direction => 'both', ucs => 0x8B7F, code => 0x8ff4e6, comment => '# CJK(8B7F)'},
+ {direction => 'both', ucs => 0x8CF4, code => 0x8ff4e7, comment => '# CJK(8CF4)'},
+ {direction => 'both', ucs => 0x8D76, code => 0x8ff4e8, comment => '# CJK(8D76)'},
+ {direction => 'both', ucs => 0x90DE, code => 0x8ff4ec, comment => '# CJK(90DE)'},
+ {direction => 'both', ucs => 0x9115, code => 0x8ff4ee, comment => '# CJK(9115)'},
+ {direction => 'both', ucs => 0x9592, code => 0x8ff4f1, comment => '# CJK(9592)'},
+ {direction => 'both', ucs => 0x973B, code => 0x8ff4f4, comment => '# CJK(973B)'},
+ {direction => 'both', ucs => 0x974D, code => 0x8ff4f5, comment => '# CJK(974D)'},
+ {direction => 'both', ucs => 0x9751, code => 0x8ff4f6, comment => '# CJK(9751)'},
+ {direction => 'both', ucs => 0x999E, code => 0x8ff4fa, comment => '# CJK(999E)'},
+ {direction => 'both', ucs => 0x9AD9, code => 0x8ff4fb, comment => '# CJK(9AD9)'},
+ {direction => 'both', ucs => 0x9B72, code => 0x8ff4fc, comment => '# CJK(9B72)'},
+ {direction => 'both', ucs => 0x9ED1, code => 0x8ff4fe, comment => '# CJK(9ED1)'},
+ {direction => 'both', ucs => 0xF929, code => 0x8ff4c5, comment => '# CJK COMPATIBILITY IDEOGRAPH-F929'},
+ {direction => 'both', ucs => 0xF9DC, code => 0x8ff4f2, comment => '# CJK COMPATIBILITY IDEOGRAPH-F9DC'},
+ {direction => 'both', ucs => 0xFA0E, code => 0x8ff4b4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0E'},
+ {direction => 'both', ucs => 0xFA0F, code => 0x8ff4b7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0F'},
+ {direction => 'both', ucs => 0xFA10, code => 0x8ff4b8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA10'},
+ {direction => 'both', ucs => 0xFA11, code => 0x8ff4bd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA11'},
+ {direction => 'both', ucs => 0xFA12, code => 0x8ff4c4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA12'},
+ {direction => 'both', ucs => 0xFA13, code => 0x8ff4c7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA13'},
+ {direction => 'both', ucs => 0xFA14, code => 0x8ff4c8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA14'},
+ {direction => 'both', ucs => 0xFA15, code => 0x8ff4ce, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA15'},
+ {direction => 'both', ucs => 0xFA16, code => 0x8ff4cf, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA16'},
+ {direction => 'both', ucs => 0xFA17, code => 0x8ff4d3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA17'},
+ {direction => 'both', ucs => 0xFA18, code => 0x8ff4d5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA18'},
+ {direction => 'both', ucs => 0xFA19, code => 0x8ff4d6, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA19'},
+ {direction => 'both', ucs => 0xFA1A, code => 0x8ff4d7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1A'},
+ {direction => 'both', ucs => 0xFA1B, code => 0x8ff4d8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1B'},
+ {direction => 'both', ucs => 0xFA1C, code => 0x8ff4da, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1C'},
+ {direction => 'both', ucs => 0xFA1D, code => 0x8ff4db, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1D'},
+ {direction => 'both', ucs => 0xFA1E, code => 0x8ff4de, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1E'},
+ {direction => 'both', ucs => 0xFA1F, code => 0x8ff4e0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1F'},
+ {direction => 'both', ucs => 0xFA20, code => 0x8ff4e2, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA20'},
+ {direction => 'both', ucs => 0xFA21, code => 0x8ff4e3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA21'},
+ {direction => 'both', ucs => 0xFA22, code => 0x8ff4e5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA22'},
+ {direction => 'both', ucs => 0xFA23, code => 0x8ff4e9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA23'},
+ {direction => 'both', ucs => 0xFA24, code => 0x8ff4ea, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA24'},
+ {direction => 'both', ucs => 0xFA25, code => 0x8ff4eb, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA25'},
+ {direction => 'both', ucs => 0xFA26, code => 0x8ff4ed, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA26'},
+ {direction => 'both', ucs => 0xFA27, code => 0x8ff4ef, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA27'},
+ {direction => 'both', ucs => 0xFA28, code => 0x8ff4f0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA28'},
+ {direction => 'both', ucs => 0xFA29, code => 0x8ff4f3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA29'},
+ {direction => 'both', ucs => 0xFA2A, code => 0x8ff4f7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2A'},
+ {direction => 'both', ucs => 0xFA2B, code => 0x8ff4f8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2B'},
+ {direction => 'both', ucs => 0xFA2C, code => 0x8ff4f9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2C'},
+ {direction => 'both', ucs => 0xFA2D, code => 0x8ff4fd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2D'},
+ {direction => 'both', ucs => 0xFF07, code => 0x8ff4a9, comment => '# FULLWIDTH APOSTROPHE'},
+ {direction => 'both', ucs => 0xFFE4, code => 0x8fa2c3, comment => '# FULLWIDTH BROKEN BAR'},
+
+ # additional conversions for EUC_JP -> UTF-8 conversion
+ {direction => 'to_unicode', ucs => 0x2116, code => 0x8ff4ac, comment => '# NUMERO SIGN'},
+ {direction => 'to_unicode', ucs => 0x2121, code => 0x8ff4ad, comment => '# TELEPHONE SIGN'},
+ {direction => 'to_unicode', ucs => 0x3231, code => 0x8ff4ab, comment => '# PARENTHESIZED IDEOGRAPH STOCK'}
+ );
+
+print_tables("EUC_JP", \@mapping);
+
+#######################################################################
+# sjis2jis ; SJIS => JIS conversion
+sub sjis2jis
+{
+ my ($sjis) = @_;
-open(FILE, $in_file) || die("cannot open $in_file");
+ return $sjis if ($sjis <= 0x100);
-while (<FILE>)
-{
- chop;
- if (/^#/)
- {
- next;
- }
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
+ my $hi = $sjis >> 8;
+ my $lo = $sjis & 0xff;
+
+ if ($lo >= 0x80) { $lo--; }
+ $lo -= 0x40;
+ if ($hi >= 0xe0) { $hi -= 0x40; }
+ $hi -= 0x81;
+ my $pos = $lo + $hi * 0xbc;
+
+ if ($pos >= 114 * 0x5e && $pos <= 115 * 0x5e + 0x1b)
{
- $utf = &ucs2utf($ucs);
- if ($array{$code} ne "")
- {
- printf STDERR "Warning: duplicate code: %04x\n", $ucs;
- next;
- }
- $count++;
+ # This region (115-ku) is outside the range of JIS codes, but to
+ # make it convenient to generate codes in EUC CODESET 3, move it to
+ # the seemingly duplicate region (83-84-ku).
+ $pos = $pos - ((31 * 0x5e) + 12);
- $code |= 0x8f8080;
- $array{$code} = $utf;
+ # after 85-ku 82-ten needs to be moved 2 codepoints
+ $pos = $pos - 2 if ($pos >= 84 * 0x5c + 82)
}
-}
-close(FILE);
-$file = "euc_jp_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
+ my $hi2 = $pos / 0x5e;
+ my $lo2 = ($pos % 0x5e);
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapEUC_JP[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
- $utf = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
- }
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
- }
-}
+ my $ret = $lo2 + 0x21 + (($hi2 + 0x21) << 8);
-print FILE "};\n";
-close(FILE);
+ return $ret;
+}
diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl
index a7c94bca915..a917d067172 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl
@@ -16,113 +16,22 @@
# UCS-2 code in hex
# # and Unicode name (not used in this script)
-require "ucs2utf.pl";
+require "convutils.pm";
-# first generate UTF-8 --> EUC_KR table
+# Load the source file.
-$in_file = "KSX1001.TXT";
+my $mapping = &read_source("KSX1001.TXT");
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
- chop;
- if (/^#/)
- {
- next;
- }
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
-
- $array{$utf} = ($code | 0x8080);
- }
-}
-close(FILE);
-
-$file = "utf8_to_euc_kr.map";
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapEUC_KR[ $count ] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
+foreach my $i (@$mapping)
{
- $code = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
- }
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
- }
+ $i->{code} = $i->{code} | 0x8080;
}
-print FILE "};\n";
-close(FILE);
-
-#
-# then generate EUC_KR --> UTF8 table
-#
-reset 'array';
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
- chop;
- if (/^#/)
- {
- next;
- }
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$code} ne "")
- {
- printf STDERR "Warning: duplicate code: %04x\n", $ucs;
- next;
- }
- $count++;
-
- $code |= 0x8080;
- $array{$code} = $utf;
- }
-}
-close(FILE);
-
-$file = "euc_kr_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapEUC_KR[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
- $utf = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
- }
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
- }
-}
+# Some extra characters that are not in KSX1001.TXT
+push @$mapping, (
+ {direction => 'both', ucs => 0x20AC, code => 0xa2e6, comment => '# EURO SIGN'},
+ {direction => 'both', ucs => 0x00AE, code => 0xa2e7, comment => '# REGISTERED SIGN'},
+ {direction => 'both', ucs => 0x327E, code => 0xa2e8, comment => '# CIRCLED HANGUL IEUNG U'}
+ );
-print FILE "};\n";
-close(FILE);
+print_tables("EUC_KR", $mapping);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl
index e4fc535b180..aceef5433c2 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl
@@ -17,141 +17,47 @@
# UCS-2 code in hex
# # and Unicode name (not used in this script)
-require "ucs2utf.pl";
+require "convutils.pm";
-# first generate UTF-8 --> EUC_TW table
+my $mapping = &read_source("CNS11643.TXT");
-$in_file = "CNS11643.TXT";
+my @extras;
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
+foreach my $i (@$mapping)
{
- chop;
- if (/^#/)
+ my $ucs = $i->{ucs};
+ my $code = $i->{code};
+ my $origcode = $i->{code};
+
+ my $plane = ($code & 0x1f0000) >> 16;
+ if ($plane > 16)
{
+ printf STDERR "Warning: invalid plane No.$plane. ignored\n";
next;
}
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
-
- $plane = ($code & 0x1f0000) >> 16;
- if ($plane > 16)
- {
- printf STDERR "Warning: invalid plane No.$plane. ignored\n";
- next;
- }
-
- if ($plane == 1)
- {
- $array{$utf} = (($code & 0xffff) | 0x8080);
- }
- else
- {
- $array{$utf} =
- (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080);
- }
- }
-}
-close(FILE);
-
-$file = "utf8_to_euc_tw.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapEUC_TW[ $count ] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
- $code = $array{$index};
- $count--;
- if ($count == 0)
+ if ($plane == 1)
{
- printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
+ $code = ($code & 0xffff) | 0x8080;
}
else
{
- printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
+ $code = (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080);
}
-}
-
-print FILE "};\n";
-close(FILE);
-
-#
-# then generate EUC_TW --> UTF8 table
-#
-reset 'array';
-
-open(FILE, $in_file) || die("cannot open $in_file");
+ $i->{code} = $code;
-while (<FILE>)
-{
- chop;
- if (/^#/)
+ # Some codes are mapped twice in the EUC_TW to UTF-8 table.
+ if ($origcode >= 0x12121 && $origcode <= 0x20000)
{
- next;
- }
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$code} ne "")
- {
- printf STDERR "Warning: duplicate code: %04x\n", $ucs;
- next;
- }
- $count++;
-
- $plane = ($code & 0x1f0000) >> 16;
- if ($plane > 16)
- {
- printf STDERR "Warning: invalid plane No.$plane. ignored\n";
- next;
- }
-
- if ($plane == 1)
- {
- $c = (($code & 0xffff) | 0x8080);
- $array{$c} = $utf;
- $count++;
+ push @extras, {
+ ucs => $i->{ucs},
+ code => ($i->{code} + 0x8ea10000),
+ rest => $i->{rest},
+ direction => 'to_unicode'
}
- $c = (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080);
- $array{$c} = $utf;
}
}
-close(FILE);
-
-$file = "euc_tw_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapEUC_TW[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
- $utf = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
- }
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
- }
-}
+push @$mapping, @extras;
-print FILE "};\n";
-close(FILE);
+print_tables("EUC_TW", $mapping);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
index 043c1c27ec8..f58361024e4 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
@@ -13,8 +13,7 @@
# where the "u" field is the Unicode code point in hex,
# and the "b" field is the hex byte sequence for GB18030
-require "ucs2utf.pl";
-
+require "convutils.pm";
# Read the input
@@ -22,6 +21,8 @@ $in_file = "gb-18030-2000.xml";
open(FILE, $in_file) || die("cannot open $in_file");
+my @mapping;
+
while (<FILE>)
{
next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
@@ -32,78 +33,13 @@ while (<FILE>)
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
- $utf = &ucs2utf($ucs);
- if ($arrayu{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
+ push @mapping, {
+ ucs => $ucs,
+ code => $code,
+ direction => 'both'
}
- if ($arrayc{$code} ne "")
- {
- printf STDERR "Warning: duplicate GB18030: %08x\n", $code;
- next;
- }
- $arrayu{$utf} = $code;
- $arrayc{$code} = $utf;
- $count++;
- }
-}
-close(FILE);
-
-
-#
-# first, generate UTF8 --> GB18030 table
-#
-
-$file = "utf8_to_gb18030.map";
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapGB18030[ $count ] = {\n";
-
-$cc = $count;
-for $index (sort { $a <=> $b } keys(%arrayu))
-{
- $code = $arrayu{$index};
- $cc--;
- if ($cc == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
- }
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
}
}
-
-print FILE "};\n";
close(FILE);
-
-#
-# then generate GB18030 --> UTF8 table
-#
-
-$file = "gb18030_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapGB18030[ $count ] = {\n";
-
-$cc = $count;
-for $index (sort { $a <=> $b } keys(%arrayc))
-{
- $utf = $arrayc{$index};
- $cc--;
- if ($cc == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
- }
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
- }
-}
-
-print FILE "};\n";
-close(FILE);
+print_tables("GB18030", \@mapping);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl
new file mode 100755
index 00000000000..b98f9a7bf55
--- /dev/null
+++ b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl
@@ -0,0 +1,31 @@
+#! /usr/bin/perl
+#
+# Copyright (c) 2001-2016, PostgreSQL Global Development Group
+#
+# src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl
+#
+# Generate UTF-8 <--> JOHAB conversion tables from
+# map files provided by Unicode organization.
+# Unfortunately it is prohibited by the organization
+# to distribute the map files. So if you try to use this script,
+# you have to obtain the map files from the organization's ftp site.
+# ftp://www.unicode.org/Public/MAPPINGS/
+# We assume the file includes three tab-separated columns:
+# JOHAB code in hex
+# UCS-2 code in hex
+# # and Unicode name (not used in this script)
+
+require "convutils.pm";
+
+# Load the source file.
+
+my $mapping = &read_source("JOHAB.TXT");
+
+# Some extra characters that are not in JOHAB.TXT
+push @$mapping, (
+ {direction => 'both', ucs => 0x20AC, code => 0xd9e6, comment => '# EURO SIGN'},
+ {direction => 'both', ucs => 0x00AE, code => 0xd9e7, comment => '# REGISTERED SIGN'},
+ {direction => 'both', ucs => 0x327E, code => 0xd9e8, comment => '# CIRCLED HANGUL IEUNG U'}
+ );
+
+print_tables("JOHAB", $mapping);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl
index 51ffd86b2c9..16a53ad1d9f 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl
@@ -7,7 +7,7 @@
# Generate UTF-8 <--> SHIFT_JIS_2004 code conversion tables from
# "sjis-0213-2004-std.txt" (http://x0213.org)
-require "ucs2utf.pl";
+require "convutils.pm";
# first generate UTF-8 --> SHIFT_JIS_2004 table
@@ -15,10 +15,7 @@ $in_file = "sjis-0213-2004-std.txt";
open(FILE, $in_file) || die("cannot open $in_file");
-reset 'array';
-reset 'array1';
-reset 'comment';
-reset 'comment1';
+my @mapping;
while ($line = <FILE>)
{
@@ -29,14 +26,16 @@ while ($line = <FILE>)
$u2 = $3;
$rest = "U+" . $u1 . "+" . $u2 . $4;
$code = hex($c);
- $ucs = hex($u1);
- $utf1 = &ucs2utf($ucs);
- $ucs = hex($u2);
- $utf2 = &ucs2utf($ucs);
- $str = sprintf "%08x%08x", $utf1, $utf2;
- $array1{$str} = $code;
- $comment1{$str} = $rest;
- $count1++;
+ $ucs1 = hex($u1);
+ $ucs2 = hex($u2);
+
+ push @mapping, {
+ code => $code,
+ ucs => $ucs1,
+ ucs_second => $ucs2,
+ comment => $rest,
+ direction => 'both'
+ };
next;
}
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
@@ -52,183 +51,31 @@ while ($line = <FILE>)
$ucs = hex($u);
$code = hex($c);
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR
- "Warning: duplicate UTF8: %08x UCS: %04x Shift JIS: %04x\n", $utf,
- $ucs, $code;
- next;
- }
- $count++;
- $array{$utf} = $code;
- $comment{$code} = $rest;
-}
-close(FILE);
-
-$file = "utf8_to_shift_jis_2004.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE "static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
- $code = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%08x, 0x%06x} /* %s */\n", $index, $code,
- $comment{$code};
- }
- else
+ if ($code < 0x80 && $ucs < 0x80)
{
- printf FILE " {0x%08x, 0x%06x}, /* %s */\n", $index, $code,
- $comment{$code};
- }
-}
-
-print FILE "};\n";
-close(FILE);
-
-$file = "utf8_to_shift_jis_2004_combined.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE
-"static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[] = {\n";
-
-for $index (sort { $a cmp $b } keys(%array1))
-{
- $code = $array1{$index};
- $count1--;
- if ($count1 == 0)
- {
- printf FILE " {0x%s, 0x%s, 0x%04x} /* %s */\n", substr($index, 0, 8),
- substr($index, 8, 8), $code, $comment1{$index};
- }
- else
- {
- printf FILE " {0x%s, 0x%s, 0x%04x}, /* %s */\n",
- substr($index, 0, 8), substr($index, 8, 8), $code,
- $comment1{$index};
- }
-}
-
-print FILE "};\n";
-close(FILE);
-
-# then generate SHIFT_JIS_2004 --> UTF-8 table
-
-$in_file = "sjis-0213-2004-std.txt";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-reset 'array';
-reset 'array1';
-reset 'comment';
-reset 'comment1';
-
-while ($line = <FILE>)
-{
- if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
- {
- $c = $1;
- $u1 = $2;
- $u2 = $3;
- $rest = "U+" . $u1 . "+" . $u2 . $4;
- $code = hex($c);
- $ucs = hex($u1);
- $utf1 = &ucs2utf($ucs);
- $ucs = hex($u2);
- $utf2 = &ucs2utf($ucs);
- $str = sprintf "%08x%08x", $utf1, $utf2;
- $array1{$code} = $str;
- $comment1{$code} = $rest;
- $count1++;
next;
}
- elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
+ elsif ($code < 0x80)
{
- $c = $1;
- $u = $2;
- $rest = "U+" . $u . $3;
+ $direction = 'from_unicode';
}
- else
- {
- next;
- }
-
- $ucs = hex($u);
- $code = hex($c);
- $utf = &ucs2utf($ucs);
- if ($array{$code} ne "")
- {
- printf STDERR
- "Warning: duplicate UTF8: %08x UCS: %04x Shift JIS: %04x\n", $utf,
- $ucs, $code;
- printf STDERR "Previous value: UTF8: %08x\n", $array{$utf};
- next;
- }
- $count++;
-
- $array{$code} = $utf;
- $comment{$utf} = $rest;
-}
-close(FILE);
-
-$file = "shift_jis_2004_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_SHIFTJIS_2004.pl\n";
-print FILE " */\n";
-print FILE "static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
- $code = $array{$index};
- $count--;
- if ($count == 0)
+ elsif ($ucs < 0x80)
{
- printf FILE " {0x%04x, 0x%08x} /* %s */\n", $index, $code,
- $comment{$code};
+ $direction = 'to_unicode';
}
else
{
- printf FILE " {0x%04x, 0x%08x}, /* %s */\n", $index, $code,
- $comment{$code};
+ $direction = 'both';
}
-}
-
-print FILE "};\n";
-close(FILE);
-
-$file = "shift_jis_2004_to_utf8_combined.map";
-open(FILE, "> $file") || die("cannot open $file");
-print FILE "/*\n";
-print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n";
-print FILE " */\n";
-print FILE
-"static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[] = {\n";
-for $index (sort { $a <=> $b } keys(%array1))
-{
- $code = $array1{$index};
- $count1--;
- if ($count1 == 0)
- {
- printf FILE " {0x%04x, 0x%s, 0x%s} /* %s */\n", $index,
- substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
- }
- else
- {
- printf FILE " {0x%04x, 0x%s, 0x%s}, /* %s */\n", $index,
- substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
- }
+ push @mapping, {
+ code => $code,
+ ucs => $ucs,
+ comment => $rest,
+ direction => $direction
+ };
}
-
-print FILE "};\n";
close(FILE);
+
+print_tables("SHIFT_JIS_2004", \@mapping, 1);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl b/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl
index 10e54b157d2..c8ff712af8f 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl
@@ -4,138 +4,45 @@
#
# src/backend/utils/mb/Unicode/UCS_to_SJIS.pl
#
-# Generate UTF-8 <--> SJIS code conversion tables from
-# map files provided by Unicode organization.
-# Unfortunately it is prohibited by the organization
-# to distribute the map files. So if you try to use this script,
-# you have to obtain SHIFTJIS.TXT from
-# the organization's ftp site.
-#
-# SHIFTJIS.TXT format:
-# SHIFTJIS code in hex
-# UCS-2 code in hex
-# # and Unicode name (not used in this script)
-# Warning: SHIFTJIS.TXT contains only JIS0201 and JIS0208. no JIS0212.
-
-require "ucs2utf.pl";
-
-# first generate UTF-8 --> SJIS table
-
-$in_file = "CP932.TXT";
-$count = 0;
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
- chop;
- if (/^#/)
- {
- next;
- }
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ((($code >= 0xed40) && ($code <= 0xeefc))
- || ( ($code >= 0x8754)
- && ($code <= 0x875d))
- || ($code == 0x878a)
- || ($code == 0x8782)
- || ($code == 0x8784)
- || ($code == 0xfa5b)
- || ($code == 0xfa54)
- || ( ($code >= 0x8790)
- && ($code <= 0x8792))
- || ( ($code >= 0x8795)
- && ($code <= 0x8797))
- || ( ($code >= 0x879a)
- && ($code <= 0x879c)))
- {
- printf STDERR "Warning: duplicate UTF8: UCS=0x%04x SJIS=0x%04x\n",
- $ucs,
- $code;
- next;
- }
- $count++;
- $array{$utf} = $code;
- }
-}
+# Generate UTF-8 <=> SJIS code conversion radix tree from CP932.TXT,
+# a map file provided by the Unicode organization. Unfortunately it is
+# prohibited by the organization to distribute the map files. So if you
+# try to use this script, you have to obtain CP932.TXT from the
+# organization's ftp site.
-close(FILE);
+use strict;
+require "convutils.pm";
-$file = "utf8_to_sjis.map";
-open(FILE, "> $file") || die("cannot open $file");
+my $charset = read_source("CP932.TXT");
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapSJIS[ $count ] = {\n";
+# Drop these SJIS codes from the source for UTF8=>SJIS conversion
+my @reject_sjis =(
+ 0xed40..0xeefc, 0x8754..0x875d, 0x878a, 0x8782,
+ 0x8784, 0xfa5b, 0xfa54, 0x8790..0x8792, 0x8795..0x8797,
+ 0x879a..0x879c
+);
-for $index (sort { $a <=> $b } keys(%array))
+foreach my $i (@$charset)
{
- $code = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
- }
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
- }
-}
-
-print FILE "};\n";
-close(FILE);
+ my $code = $i->{code};
+ my $ucs = $i->{ucs};
-#
-# then generate SJIS --> UTF8 table
-#
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-reset 'array';
-$count = 0;
-
-while (<FILE>)
-{
- chop;
- if (/^#/)
- {
- next;
- }
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- $count++;
-
- $array{$code} = $utf;
- }
-}
-close(FILE);
-
-$file = "sjis_to_utf8.map";
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapSJIS[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
- $utf = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
- }
- else
+ if (grep {$code == $_} @reject_sjis)
{
- printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
+ $i->{direction} = "to_unicode";
}
}
-print FILE "};\n";
-close(FILE);
+# Add these UTF8->SJIS pairs to the table.
+push @$charset, (
+ {direction => "from_unicode", ucs => 0x00a2, code => 0x8191, comment => '# CENT SIGN'},
+ {direction => "from_unicode", ucs => 0x00a3, code => 0x8192, comment => '# POUND SIGN'},
+ {direction => "from_unicode", ucs => 0x00a5, code => 0x5c, comment => '# YEN SIGN'},
+ {direction => "from_unicode", ucs => 0x00ac, code => 0x81ca, comment => '# NOT SIGN'},
+ {direction => "from_unicode", ucs => 0x2016, code => 0x8161, comment => '# DOUBLE VERTICAL LINE'},
+ {direction => "from_unicode", ucs => 0x203e, code => 0x7e, comment => '# OVERLINE'},
+ {direction => "from_unicode", ucs => 0x2212, code => 0x817c, comment => '# MINUS SIGN'},
+ {direction => "from_unicode", ucs => 0x301c, code => 0x8160, comment => '# WAVE DASH'}
+);
+
+print_tables("SJIS", $charset);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl
new file mode 100755
index 00000000000..b6bf3bd8f27
--- /dev/null
+++ b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl
@@ -0,0 +1,51 @@
+#! /usr/bin/perl
+#
+# Copyright (c) 2007-2016, PostgreSQL Global Development Group
+#
+# src/backend/utils/mb/Unicode/UCS_to_UHC.pl
+#
+# Generate UTF-8 <--> UHC code conversion tables from
+# "windows-949-2000.xml", obtained from
+# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
+#
+# The lines we care about in the source file look like
+# <a u="009A" b="81 30 83 36"/>
+# where the "u" field is the Unicode code point in hex,
+# and the "b" field is the hex byte sequence for UHC
+
+require "convutils.pm";
+
+# Read the input
+
+$in_file = "windows-949-2000.xml";
+
+open(FILE, $in_file) || die("cannot open $in_file");
+
+my @mapping;
+
+while (<FILE>)
+{
+ next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
+ $u = $1;
+ $c = $2;
+ $c =~ s/ //g;
+ $ucs = hex($u);
+ $code = hex($c);
+
+ next if ($code == 0x0080 || $code == 0x00FF);
+
+ if ($code >= 0x80 && $ucs >= 0x0080)
+ {
+ push @mapping, {
+ ucs => $ucs,
+ code => $code,
+ direction => 'both'
+ }
+ }
+}
+close(FILE);
+
+# One extra character that's not in the source file.
+push @mapping, { direction => 'both', code => 0xa2e8, ucs => 0x327e, comment => 'CIRCLED HANGUL IEUNG U' };
+
+print_tables("UHC", \@mapping);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_most.pl b/src/backend/utils/mb/Unicode/UCS_to_most.pl
index 125378f149a..a3cf436eefd 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_most.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_most.pl
@@ -15,7 +15,7 @@
# UCS-2 code in hex
# # and Unicode name (not used in this script)
-require "ucs2utf.pl";
+require "convutils.pm";
%filename = (
'WIN866' => 'CP866.TXT',
@@ -44,121 +44,13 @@ require "ucs2utf.pl";
'ISO8859_16' => '8859-16.TXT',
'KOI8R' => 'KOI8-R.TXT',
'KOI8U' => 'KOI8-U.TXT',
- 'GBK' => 'CP936.TXT',
- 'UHC' => 'CP949.TXT',
- 'JOHAB' => 'JOHAB.TXT',);
+ 'GBK' => 'CP936.TXT');
@charsets = keys(%filename);
@charsets = @ARGV if scalar(@ARGV);
foreach $charset (@charsets)
{
+ my $mapping = &read_source($filename{$charset});
- #
- # first, generate UTF8-> charset table
- #
- $in_file = $filename{$charset};
-
- open(FILE, $in_file) || die("cannot open $in_file");
-
- reset 'array';
-
- while (<FILE>)
- {
- chop;
- if (/^#/)
- {
- next;
- }
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$utf} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
- $array{$utf} = $code;
- }
- }
- close(FILE);
-
- $file = lc("utf8_to_${charset}.map");
- open(FILE, "> $file") || die("cannot open $file");
-
- print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
- print FILE "static const pg_utf_to_local ULmap${charset}[ $count ] = {\n";
-
- for $index (sort { $a <=> $b } keys(%array))
- {
- $code = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
- }
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
- }
- }
-
- print FILE "};\n";
- close(FILE);
-
- #
- # then generate character set code ->UTF8 table
- #
- open(FILE, $in_file) || die("cannot open $in_file");
-
- reset 'array';
-
- while (<FILE>)
- {
- chop;
- if (/^#/)
- {
- next;
- }
- ($c, $u, $rest) = split;
- $ucs = hex($u);
- $code = hex($c);
- if ($code >= 0x80 && $ucs >= 0x0080)
- {
- $utf = &ucs2utf($ucs);
- if ($array{$code} ne "")
- {
- printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
- next;
- }
- $count++;
- $array{$code} = $utf;
- }
- }
- close(FILE);
-
- $file = lc("${charset}_to_utf8.map");
- open(FILE, "> $file") || die("cannot open $file");
-
- print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
- print FILE "static const pg_local_to_utf LUmap${charset}[ $count ] = {\n";
- for $index (sort { $a <=> $b } keys(%array))
- {
- $utf = $array{$index};
- $count--;
- if ($count == 0)
- {
- printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
- }
- else
- {
- printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
- }
- }
-
- print FILE "};\n";
- close(FILE);
+ print_tables($charset, $mapping);
}
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm
new file mode 100644
index 00000000000..d6a13e8c02c
--- /dev/null
+++ b/src/backend/utils/mb/Unicode/convutils.pm
@@ -0,0 +1,282 @@
+#
+# Copyright (c) 2001-2016, PostgreSQL Global Development Group
+#
+# src/backend/utils/mb/Unicode/convutils.pm
+
+use strict;
+
+#######################################################################
+# convert UCS-4 to UTF-8
+#
+ sub ucs2utf
+ {
+     my ($ucs) = @_;
+
+     # The result is the UTF-8 byte sequence for $ucs packed into a single
+     # integer, lead byte in the most significant position.
+
+     # One byte: values up to 0x7f are returned unchanged.
+     return ($ucs) if ($ucs <= 0x007f);
+
+     # Lowest continuation byte (10xxxxxx), shared by all longer forms.
+     my $tail0 = ($ucs & 0x003f) | 0x80;
+
+     # Two bytes: 0x0080 .. 0x07ff.
+     if ($ucs <= 0x07ff)
+     {
+         my $lead = ($ucs >> 6) | 0xc0;
+         return (($lead << 8) | $tail0);
+     }
+
+     # Second-lowest continuation byte, used by the 3- and 4-byte forms.
+     my $tail1 = (($ucs & 0x0fc0) >> 6) | 0x80;
+
+     # Three bytes: 0x0800 .. 0xffff.
+     if ($ucs <= 0xffff)
+     {
+         my $lead = ($ucs >> 12) | 0xe0;
+         return (($lead << 16) | ($tail1 << 8) | $tail0);
+     }
+
+     # Four bytes: everything above 0xffff.
+     my $tail2 = (($ucs & 0x3ffff) >> 12) | 0x80;
+     my $lead  = ($ucs >> 18) | 0xf0;
+     return (($lead << 24) | ($tail2 << 16) | ($tail1 << 8) | $tail0);
+ }
+
+#######################################################################
+# read_source - common routine to read source file
+#
+# fname ; input file name
+ sub read_source
+ {
+     # Read a mapping source file and return a reference to an array of
+     # hashes, one per accepted line, with the fields:
+     #   f, l      - file name and line number the entry came from
+     #   code      - first hex column, as an integer
+     #   ucs       - second hex column, as an integer
+     #   comment   - trailing "#..." text of the line
+     #   direction - always "both"
+     # Dies if the file cannot be opened; on a malformed line, reports it
+     # on STDERR and exits with a non-zero status.
+     my ($fname) = @_;
+     my @r;
+
+     open(my $in, '<', $fname) || die("cannot open $fname");
+
+     while (<$in>)
+     {
+         next if (/^#/);
+
+         # chomp, not chop: a final line without a trailing newline must
+         # not lose its last character.
+         chomp;
+
+         next if (/^$/);    # Ignore empty lines
+
+         # A code followed directly by a comment has no mapping; skip it.
+         # Accept both hex cases, like the main pattern below.
+         next if (/^0x(?:[0-9A-Fa-f]+)\s+(?:#.*)$/);
+
+         # Two hex columns, an optional third hex column, then a comment.
+         # NOTE(review): the original comment here said "Skip the first
+         # column for JIS0208.TXT", but the code takes columns one and two
+         # and ignores the optional third - confirm which is intended.
+         if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/)
+         {
+             print STDERR "READ ERROR at line $. in $fname: $_\n";
+
+             # Exit with a failure status so that a calling Makefile
+             # notices the broken input.
+             exit 1;
+         }
+         my $out = {f => $fname, l => $.,
+                    code => hex($1),
+                    ucs => hex($2),
+                    comment => $4,
+                    direction => "both"
+         };
+
+         # Ignore pure ASCII mappings. PostgreSQL character conversion code
+         # never even passes these to the conversion code.
+         next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);
+
+         push(@r, $out);
+     }
+     close($in);
+
+     return \@r;
+ }
+
+##################################################################
+# print_tables : output mapping tables
+#
+# Arguments:
+# charset - string name of the character set.
+# table - mapping table (see format below)
+ # verbose - if 1, output a comment on each line;
+ # if 2, also output the source file name and line number
+#
+#
+#
+# Mapping table format:
+#
+# Mapping table is a list of hashes. Each hash has the following fields:
+# direction - Direction: 'both', 'from_unicode' or 'to_unicode'
+# ucs - Unicode code point
+# ucs_second - Second Unicode code point, if this is a "combined" character.
+# code - Byte sequence in the "other" character set, as an integer
+# comment - Text representation of the character
+# f - Source filename
+# l - Line number in source file
+#
+#
+ sub print_tables
+ {
+     my ($charset, $table, $verbose) = @_;
+
+     # Split the mappings into four buckets: by conversion direction, and
+     # by whether the mapping carries a second ("combined") code point.
+     my (@to_plain, @to_combined, @from_plain, @from_combined);
+
+     foreach my $map (@$table)
+     {
+         my $combined = defined $map->{ucs_second};
+
+         # Output record; code points are converted to packed UTF-8 here.
+         my %entry = (
+             utf8    => ucs2utf($map->{ucs}),
+             code    => $map->{code},
+             comment => $map->{comment},
+             f       => $map->{f},
+             l       => $map->{l});
+         $entry{utf8_second} = ucs2utf($map->{ucs_second}) if ($combined);
+
+         # A "both" mapping lands in the to- and the from-bucket alike.
+         my $dir = $map->{direction};
+         if ($dir eq "both" || $dir eq "to_unicode")
+         {
+             if ($combined) { push @to_combined, \%entry; }
+             else           { push @to_plain,    \%entry; }
+         }
+         if ($dir eq "both" || $dir eq "from_unicode")
+         {
+             if ($combined) { push @from_combined, \%entry; }
+             else           { push @from_plain,    \%entry; }
+         }
+     }
+
+     # The plain maps are always written; the combined ones only when
+     # there is at least one combined mapping for that direction.
+     print_to_utf8_map($charset, \@to_plain, $verbose);
+     if (scalar @to_combined > 0)
+     {
+         print_to_utf8_combined_map($charset, \@to_combined, $verbose);
+     }
+     print_from_utf8_map($charset, \@from_plain, $verbose);
+     if (scalar @from_combined > 0)
+     {
+         print_from_utf8_combined_map($charset, \@from_combined, $verbose);
+     }
+ }
+
+ sub print_from_utf8_map
+ {
+     # Write utf8_to_<charset>.map (lower-cased name) into the current
+     # directory: a C array ULmap<charset> with one {utf8, code} pair per
+     # entry of @$table, ordered by the utf8 value.
+     #   charset - character set name, used in file and array names
+     #   table   - reference to a list of hashes with utf8, code, comment,
+     #             f and l fields
+     #   verbose - true: append each entry's comment; 2 or more: prefix
+     #             the comment with "file:line"
+     my ($charset, $table, $verbose) = @_;
+
+     # Callers may omit verbose; treat that as "off".
+     $verbose = 0 unless (defined $verbose);
+
+     my $last_comment = "";
+
+     my $fname = lc("utf8_to_${charset}.map");
+     print "- Writing UTF8=>${charset} conversion table: $fname\n";
+     open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
+     printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n".
+            "static const pg_utf_to_local ULmap${charset}[ %d ] = {",
+            scalar(@$table));
+     my $first = 1;
+     foreach my $row (sort { $a->{utf8} <=> $b->{utf8} } @$table)
+     {
+         # The separator and the previous entry's comment are emitted at
+         # the start of the next iteration, so that the last entry gets
+         # no trailing comma.
+         print($out ",") if (!$first);
+         $first = 0;
+         print($out "\t/* $last_comment */") if ($verbose);
+
+         printf($out "\n {0x%04x, 0x%04x}", $row->{utf8}, $row->{code});
+         if ($verbose >= 2)
+         {
+             $last_comment = "$row->{f}:$row->{l} $row->{comment}";
+         }
+         else
+         {
+             $last_comment = $row->{comment};
+         }
+     }
+     print($out "\t/* $last_comment */") if ($verbose);
+     print $out "\n};\n";
+
+     # Buffered write errors only surface at close; do not leave a
+     # silently truncated map behind.
+     close($out) || die "cannot close output file : $fname: $!\n";
+ }
+
+ sub print_from_utf8_combined_map
+ {
+     # Write utf8_to_<charset>_combined.map (lower-cased name) into the
+     # current directory: a C array ULmap<charset>_combined with one
+     # {utf8, utf8_second, code} triple per entry of @$table.
+     #   charset - character set name, used in file and array names
+     #   table   - reference to a list of hashes with utf8, utf8_second,
+     #             code and comment fields
+     #   verbose - true: append each entry's comment
+     my ($charset, $table, $verbose) = @_;
+
+     my $last_comment = "";
+
+     my $fname = lc("utf8_to_${charset}_combined.map");
+     print "- Writing UTF8=>${charset} conversion table: $fname\n";
+     open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
+     printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n".
+            "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
+            scalar(@$table));
+
+     # Several entries can share the first code point, so order by the
+     # pair; otherwise their relative order would depend on input order.
+     # NOTE(review): presumably the C-side lookup compares both values -
+     # confirm against the consumer of this table.
+     my @sorted = sort {
+              $a->{utf8} <=> $b->{utf8}
+           || $a->{utf8_second} <=> $b->{utf8_second}
+     } @$table;
+
+     my $first = 1;
+     foreach my $row (@sorted)
+     {
+         # Separator and previous comment are emitted at the start of the
+         # next iteration, so the last entry gets no trailing comma.
+         print($out ",") if (!$first);
+         $first = 0;
+         print($out "\t/* $last_comment */") if ($verbose);
+
+         printf($out "\n {0x%08x, 0x%08x, 0x%04x}",
+                $row->{utf8}, $row->{utf8_second}, $row->{code});
+         $last_comment = "$row->{comment}";
+     }
+     print($out "\t/* $last_comment */") if ($verbose);
+     print $out "\n};\n";
+
+     # Buffered write errors only surface at close; do not leave a
+     # silently truncated map behind.
+     close($out) || die "cannot close output file : $fname: $!\n";
+ }
+
+ sub print_to_utf8_map
+ {
+     # Write <charset>_to_utf8.map (lower-cased name) into the current
+     # directory: a C array LUmap<charset> with one {code, utf8} pair per
+     # entry of @$table, ordered by the code value.
+     #   charset - character set name, used in file and array names
+     #   table   - reference to a list of hashes with code, utf8, comment,
+     #             f and l fields
+     #   verbose - true: append each entry's comment; 2 or more: prefix
+     #             the comment with "file:line"
+     my ($charset, $table, $verbose) = @_;
+
+     # Callers may omit verbose; treat that as "off".
+     $verbose = 0 unless (defined $verbose);
+
+     my $last_comment = "";
+
+     my $fname = lc("${charset}_to_utf8.map");
+
+     print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
+     open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
+     printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n".
+            "static const pg_local_to_utf LUmap${charset}[ %d ] = {",
+            scalar(@$table));
+     my $first = 1;
+     foreach my $row (sort { $a->{code} <=> $b->{code} } @$table)
+     {
+         # Separator and previous comment are emitted at the start of the
+         # next iteration, so the last entry gets no trailing comma.
+         print($out ",") if (!$first);
+         $first = 0;
+         print($out "\t/* $last_comment */") if ($verbose);
+
+         printf($out "\n {0x%04x, 0x%x}", $row->{code}, $row->{utf8});
+         if ($verbose >= 2)
+         {
+             $last_comment = "$row->{f}:$row->{l} $row->{comment}";
+         }
+         else
+         {
+             $last_comment = $row->{comment};
+         }
+     }
+     print($out "\t/* $last_comment */") if ($verbose);
+     print $out "\n};\n";
+
+     # Buffered write errors only surface at close; do not leave a
+     # silently truncated map behind.
+     close($out) || die "cannot close output file : $fname: $!\n";
+ }
+
+ sub print_to_utf8_combined_map
+ {
+     # Write <charset>_to_utf8_combined.map (lower-cased name) into the
+     # current directory: a C array LUmap<charset>_combined with one
+     # {code, utf8, utf8_second} triple per entry of @$table, ordered by
+     # the code value.
+     #   charset - character set name, used in file and array names
+     #   table   - reference to a list of hashes with code, utf8,
+     #             utf8_second and comment fields
+     #   verbose - true: append each entry's comment
+     my ($charset, $table, $verbose) = @_;
+
+     my $last_comment = "";
+
+     my $fname = lc("${charset}_to_utf8_combined.map");
+
+     print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
+     open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
+     printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n".
+            "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
+            scalar(@$table));
+     my $first = 1;
+     foreach my $row (sort { $a->{code} <=> $b->{code} } @$table)
+     {
+         # Separator and previous comment are emitted at the start of the
+         # next iteration, so the last entry gets no trailing comma.
+         print($out ",") if (!$first);
+         $first = 0;
+         print($out "\t/* $last_comment */") if ($verbose);
+
+         printf($out "\n {0x%04x, 0x%08x, 0x%08x}",
+                $row->{code}, $row->{utf8}, $row->{utf8_second});
+         $last_comment = "$row->{comment}";
+     }
+     print($out "\t/* $last_comment */") if ($verbose);
+     print $out "\n};\n";
+
+     # Buffered write errors only surface at close; do not leave a
+     # silently truncated map behind.
+     close($out) || die "cannot close output file : $fname: $!\n";
+ }
+
+1;
diff --git a/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map b/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map
index 2c3a607bf86..33fd42ac464 100644
--- a/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map
+++ b/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_EUC_JIS_2004.pl
- */
-static const pg_local_to_utf LUmapEUC_JIS_2004[] = {
+/* src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map */
+
+static const pg_local_to_utf LUmapEUC_JIS_2004[ 11303 ] = { /* */
{0x0080, 0xc280}, /* U+0080 <control> */
{0x0081, 0xc281}, /* U+0081 <control> */
{0x0082, 0xc282}, /* U+0082 <control> */
@@ -205,7 +204,7 @@ static const pg_local_to_utf LUmapEUC_JIS_2004[] = {
{0xa2ac, 0xe28691}, /* U+2191 UPWARDS ARROW */
{0xa2ad, 0xe28693}, /* U+2193 DOWNWARDS ARROW */
{0xa2ae, 0xe38093}, /* U+3013 GETA MARK */
- {0xa2af, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */
+ {0xa2af, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE */
{0xa2b0, 0xefbc82}, /* U+FF02 FULLWIDTH QUOTATION MARK [2000] */
{0xa2b1, 0xefbc8d}, /* U+FF0D FULLWIDTH HYPHEN-MINUS [2000] */
{0xa2b2, 0xefbd9e}, /* U+FF5E FULLWIDTH TILDE [2000] */
diff --git a/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map b/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map
index 7a7f85b105d..2d8987b9908 100644
--- a/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map
+++ b/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_EUC_JIS_2004.pl
- */
-static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = {
+/* src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map */
+
+static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[ 25 ] = { /* */
{0xa4f7, 0x00e3818b, 0x00e3829a}, /* U+304B+309A [2000] */
{0xa4f8, 0x00e3818d, 0x00e3829a}, /* U+304D+309A [2000] */
{0xa4f9, 0x00e3818f, 0x00e3829a}, /* U+304F+309A [2000] */
diff --git a/src/backend/utils/mb/Unicode/euc_jp_to_utf8.map b/src/backend/utils/mb/Unicode/euc_jp_to_utf8.map
index db427cbb24c..eb17f9829c5 100644
--- a/src/backend/utils/mb/Unicode/euc_jp_to_utf8.map
+++ b/src/backend/utils/mb/Unicode/euc_jp_to_utf8.map
@@ -1,6 +1,6 @@
/* src/backend/utils/mb/Unicode/euc_jp_to_utf8.map */
-static const pg_local_to_utf LUmapEUC_JP[] = {
+static const pg_local_to_utf LUmapEUC_JP[ 13197 ] = {
{0x8ea1, 0xefbda1},
{0x8ea2, 0xefbda2},
{0x8ea3, 0xefbda3},
@@ -13197,5 +13197,5 @@ static const pg_local_to_utf LUmapEUC_JP[] = {
{0x8ff4fb, 0xe9ab99},
{0x8ff4fc, 0xe9adb2},
{0x8ff4fd, 0xefa8ad},
- {0x8ff4fe, 0xe9bb91},
+ {0x8ff4fe, 0xe9bb91}
};
diff --git a/src/backend/utils/mb/Unicode/euc_kr_to_utf8.map b/src/backend/utils/mb/Unicode/euc_kr_to_utf8.map
index e37152137d6..701a7a476ff 100644
--- a/src/backend/utils/mb/Unicode/euc_kr_to_utf8.map
+++ b/src/backend/utils/mb/Unicode/euc_kr_to_utf8.map
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/euc_kr_to_utf8.map */
+
static const pg_local_to_utf LUmapEUC_KR[ 8227 ] = {
{0xa1a1, 0xe38080},
{0xa1a2, 0xe38081},
diff --git a/src/backend/utils/mb/Unicode/johab_to_utf8.map b/src/backend/utils/mb/Unicode/johab_to_utf8.map
index 8110f6e8531..e31d24184c1 100644
--- a/src/backend/utils/mb/Unicode/johab_to_utf8.map
+++ b/src/backend/utils/mb/Unicode/johab_to_utf8.map
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/johab_to_utf8.map */
+
static const pg_local_to_utf LUmapJOHAB[ 17049 ] = {
{0x8444, 0xe384b3},
{0x8446, 0xe384b5},
diff --git a/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map b/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map
index 81c898c6be4..958dde7b83d 100644
--- a/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map
+++ b/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_SHIFTJIS_2004.pl
- */
-static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {
+/* src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map */
+
+static const pg_local_to_utf LUmapSHIFT_JIS_2004[ 11271 ] = { /* */
{0x00a1, 0xefbda1}, /* U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP */
{0x00a2, 0xefbda2}, /* U+FF62 HALFWIDTH LEFT CORNER BRACKET */
{0x00a3, 0xefbda3}, /* U+FF63 HALFWIDTH RIGHT CORNER BRACKET */
@@ -173,7 +172,7 @@ static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {
{0x81aa, 0xe28691}, /* U+2191 UPWARDS ARROW */
{0x81ab, 0xe28693}, /* U+2193 DOWNWARDS ARROW */
{0x81ac, 0xe38093}, /* U+3013 GETA MARK */
- {0x81ad, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */
+ {0x81ad, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE */
{0x81ae, 0xefbc82}, /* U+FF02 FULLWIDTH QUOTATION MARK [2000] */
{0x81af, 0xefbc8d}, /* U+FF0D FULLWIDTH HYPHEN-MINUS [2000] */
{0x81b0, 0x7e}, /* U+007E TILDE [2000] Fullwidth: U+FF5E */
diff --git a/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map b/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map
index b1c7bced5fd..414e59dc404 100644
--- a/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map
+++ b/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_SHIFT_JIS_2004.pl
- */
-static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[] = {
+/* src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map */
+
+static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[ 25 ] = { /* */
{0x82f5, 0x00e3818b, 0x00e3829a}, /* U+304B+309A [2000] */
{0x82f6, 0x00e3818d, 0x00e3829a}, /* U+304D+309A [2000] */
{0x82f7, 0x00e3818f, 0x00e3829a}, /* U+304F+309A [2000] */
diff --git a/src/backend/utils/mb/Unicode/ucs2utf.pl b/src/backend/utils/mb/Unicode/ucs2utf.pl
deleted file mode 100644
index e0f1fb226fd..00000000000
--- a/src/backend/utils/mb/Unicode/ucs2utf.pl
+++ /dev/null
@@ -1,35 +0,0 @@
-#
-# Copyright (c) 2001-2016, PostgreSQL Global Development Group
-#
-# src/backend/utils/mb/Unicode/ucs2utf.pl
-# convert UCS-4 to UTF-8
-#
-sub ucs2utf
-{
- local ($ucs) = @_;
- local $utf;
-
- if ($ucs <= 0x007f)
- {
- $utf = $ucs;
- }
- elsif ($ucs > 0x007f && $ucs <= 0x07ff)
- {
- $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
- }
- elsif ($ucs > 0x07ff && $ucs <= 0xffff)
- {
- $utf =
- ((($ucs >> 12) | 0xe0) << 16) |
- (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
- }
- else
- {
- $utf =
- ((($ucs >> 18) | 0xf0) << 24) |
- (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
- (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
- }
- return ($utf);
-}
-1;
diff --git a/src/backend/utils/mb/Unicode/uhc_to_utf8.map b/src/backend/utils/mb/Unicode/uhc_to_utf8.map
index 26a7b18f658..65c7e114a3a 100644
--- a/src/backend/utils/mb/Unicode/uhc_to_utf8.map
+++ b/src/backend/utils/mb/Unicode/uhc_to_utf8.map
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/uhc_to_utf8.map */
+
static const pg_local_to_utf LUmapUHC[ 17237 ] = {
{0x8141, 0xeab082},
{0x8142, 0xeab083},
diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_cn.map b/src/backend/utils/mb/Unicode/utf8_to_euc_cn.map
index b28eb9cc0c7..3d64cd1a604 100644
--- a/src/backend/utils/mb/Unicode/utf8_to_euc_cn.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_euc_cn.map
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/utf8_to_euc_cn.map */
+
static const pg_utf_to_local ULmapEUC_CN[ 7445 ] = {
{0xc2a4, 0xa1e8},
{0xc2a7, 0xa1ec},
diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map b/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map
index 51372012176..b50e232b6ce 100644
--- a/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_EUC_JIS_2004.pl
- */
-static const pg_utf_to_local ULmapEUC_JIS_2004[] = {
+/* src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map */
+
+static const pg_utf_to_local ULmapEUC_JIS_2004[ 11303 ] = { /* */
{0xc280, 0x0080}, /* U+0080 <control> */
{0xc281, 0x0081}, /* U+0081 <control> */
{0xc282, 0x0082}, /* U+0082 <control> */
@@ -10849,7 +10848,7 @@ static const pg_utf_to_local ULmapEUC_JIS_2004[] = {
{0xefbc84, 0xa1f0}, /* U+FF04 FULLWIDTH DOLLAR SIGN */
{0xefbc85, 0xa1f3}, /* U+FF05 FULLWIDTH PERCENT SIGN */
{0xefbc86, 0xa1f5}, /* U+FF06 FULLWIDTH AMPERSAND */
- {0xefbc87, 0xa2af}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */
+ {0xefbc87, 0xa2af}, /* U+FF07 FULLWIDTH APOSTROPHE */
{0xefbc88, 0xa1ca}, /* U+FF08 FULLWIDTH LEFT PARENTHESIS */
{0xefbc89, 0xa1cb}, /* U+FF09 FULLWIDTH RIGHT PARENTHESIS */
{0xefbc8a, 0xa1f6}, /* U+FF0A FULLWIDTH ASTERISK */
diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map b/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map
index d8ff5c05868..0d57667a558 100644
--- a/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_EUC_JIS_2004.pl
- */
-static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = {
+/* src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map */
+
+static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[ 25 ] = { /* */
{0x0000c3a6, 0x0000cc80, 0xabc4}, /* U+00E6+0300 [2000] */
{0x0000c994, 0x0000cc80, 0xabc8}, /* U+0254+0300 [2000] */
{0x0000c994, 0x0000cc81, 0xabc9}, /* U+0254+0301 [2000] */
diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_jp.map b/src/backend/utils/mb/Unicode/utf8_to_euc_jp.map
index 137d4fdef61..eef6db65b34 100644
--- a/src/backend/utils/mb/Unicode/utf8_to_euc_jp.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_euc_jp.map
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/utf8_to_euc_jp.map */
+
static const pg_utf_to_local ULmapEUC_JP[ 13175 ] = {
{0xc2a1, 0x8fa2c2},
{0xc2a4, 0x8fa2f0},
diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_kr.map b/src/backend/utils/mb/Unicode/utf8_to_euc_kr.map
index 4a78b260ea4..a642b2154f2 100644
--- a/src/backend/utils/mb/Unicode/utf8_to_euc_kr.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_euc_kr.map
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/utf8_to_euc_kr.map */
+
static const pg_utf_to_local ULmapEUC_KR[ 8227 ] = {
{0xc2a1, 0xa2ae},
{0xc2a4, 0xa2b4},
diff --git a/src/backend/utils/mb/Unicode/utf8_to_johab.map b/src/backend/utils/mb/Unicode/utf8_to_johab.map
index 869f8213d21..78997d82d04 100644
--- a/src/backend/utils/mb/Unicode/utf8_to_johab.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_johab.map
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/utf8_to_johab.map */
+
static const pg_utf_to_local ULmapJOHAB[ 17049 ] = {
{0xc2a1, 0xd9ae},
{0xc2a4, 0xd9b4},
diff --git a/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map b/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map
index 4fab64fc956..e9f9e638c66 100644
--- a/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_SHIFT_JIS_2004.pl
- */
-static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {
+/* src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map */
+
+static const pg_utf_to_local ULmapSHIFT_JIS_2004[ 11271 ] = { /* */
{0xc2a0, 0x8541}, /* U+00A0 NO-BREAK SPACE [2000] */
{0xc2a1, 0x8542}, /* U+00A1 INVERTED EXCLAMATION MARK [2000] */
{0xc2a2, 0x8191}, /* U+00A2 CENT SIGN Windows: U+FFE0 */
@@ -10817,7 +10816,7 @@ static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {
{0xefbc84, 0x8190}, /* U+FF04 FULLWIDTH DOLLAR SIGN */
{0xefbc85, 0x8193}, /* U+FF05 FULLWIDTH PERCENT SIGN */
{0xefbc86, 0x8195}, /* U+FF06 FULLWIDTH AMPERSAND */
- {0xefbc87, 0x81ad}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */
+ {0xefbc87, 0x81ad}, /* U+FF07 FULLWIDTH APOSTROPHE */
{0xefbc88, 0x8169}, /* U+FF08 FULLWIDTH LEFT PARENTHESIS */
{0xefbc89, 0x816a}, /* U+FF09 FULLWIDTH RIGHT PARENTHESIS */
{0xefbc8a, 0x8196}, /* U+FF0A FULLWIDTH ASTERISK */
diff --git a/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map b/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map
index e55d4a2a6cf..3642851fd6a 100644
--- a/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map
@@ -1,7 +1,6 @@
-/*
- * This file was generated by UCS_to_SHIFT_JIS_2004.pl
- */
-static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[] = {
+/* src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map */
+
+static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[ 25 ] = { /* */
{0x0000c3a6, 0x0000cc80, 0x8663}, /* U+00E6+0300 [2000] */
{0x0000c994, 0x0000cc80, 0x8667}, /* U+0254+0300 [2000] */
{0x0000c994, 0x0000cc81, 0x8668}, /* U+0254+0301 [2000] */
diff --git a/src/backend/utils/mb/Unicode/utf8_to_sjis.map b/src/backend/utils/mb/Unicode/utf8_to_sjis.map
index fb0566a1db0..cd6ea48ffc3 100644
--- a/src/backend/utils/mb/Unicode/utf8_to_sjis.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_sjis.map
@@ -3,7 +3,7 @@
static const pg_utf_to_local ULmapSJIS[ 7397 ] = {
{0xc2a2, 0x8191},
{0xc2a3, 0x8192},
- {0xc2a5, 0x5c},
+ {0xc2a5, 0x005c},
{0xc2a7, 0x8198},
{0xc2a8, 0x814e},
{0xc2ac, 0x81ca},
@@ -142,7 +142,7 @@ static const pg_utf_to_local ULmapSJIS[ 7397 ] = {
{0xe280b2, 0x818c},
{0xe280b3, 0x818d},
{0xe280bb, 0x81a6},
- {0xe280be, 0x7e},
+ {0xe280be, 0x007e},
{0xe28483, 0x818e},
{0xe28496, 0xfa59},
{0xe284a1, 0xfa5a},
diff --git a/src/backend/utils/mb/Unicode/utf8_to_uhc.map b/src/backend/utils/mb/Unicode/utf8_to_uhc.map
index 15dfb56a099..dc04726364a 100644
--- a/src/backend/utils/mb/Unicode/utf8_to_uhc.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_uhc.map
@@ -1,3 +1,5 @@
+/* src/backend/utils/mb/Unicode/utf8_to_uhc.map */
+
static const pg_utf_to_local ULmapUHC[ 17237 ] = {
{0xc2a1, 0xa2ae},
{0xc2a4, 0xa2b4},