aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/mb/Unicode/UCS_to_UHC.pl
diff options
context:
space:
mode:
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>2016-11-30 14:54:02 +0200
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>2016-11-30 14:54:52 +0200
commit1de9cc0dcca649d1900720924f4ea5c430d1a51e (patch)
tree5815918e2c884c77b48ce75a715f628e0fd2777c /src/backend/utils/mb/Unicode/UCS_to_UHC.pl
parent6c303223be34329bae2f03a87590ffa0742a65f6 (diff)
downloadpostgresql-1de9cc0dcca649d1900720924f4ea5c430d1a51e.tar.gz
postgresql-1de9cc0dcca649d1900720924f4ea5c430d1a51e.zip
Rewrite the perl scripts to produce our Unicode conversion tables.
Generate EUC_CN mappings from gb-18030-2000.xml, because GB2312.TXT is no longer available. Get UHC from windows-949-2000.xml, it's more up-to-date. Plus tons more small changes. With these changes, the perl scripts faithfully produce the *.map files we have in the repository, from the external source files. In the passing, fix the Makefile to also download CP932.TXT and CP950.TXT. Based on patches by Kyotaro Horiguchi, reviewed by Daniel Gustafsson. Discussion: https://postgr.es/m/08e7892a-d55c-eefe-76e6-7910bc8dd1f3@iki.fi
Diffstat (limited to 'src/backend/utils/mb/Unicode/UCS_to_UHC.pl')
-rwxr-xr-xsrc/backend/utils/mb/Unicode/UCS_to_UHC.pl51
1 files changed, 51 insertions, 0 deletions
diff --git a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl
new file mode 100755
index 00000000000..b6bf3bd8f27
--- /dev/null
+++ b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl
@@ -0,0 +1,51 @@
+#! /usr/bin/perl
+#
+# Copyright (c) 2007-2016, PostgreSQL Global Development Group
+#
+# src/backend/utils/mb/Unicode/UCS_to_UHC.pl
+#
+# Generate UTF-8 <--> UHC code conversion tables from
+# "windows-949-2000.xml", obtained from
+# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
+#
+# The lines we care about in the source file look like
+# <a u="AC00" b="B0 A1"/>
+# where the "u" field is the Unicode code point in hex,
+# and the "b" field is the hex byte sequence for UHC
+
+use strict;
+use warnings;
+
+require "convutils.pm";
+
+# Read the input: collect one mapping entry per <a u=... b=.../> line.
+my $in_file = "windows-949-2000.xml";
+
+# Three-argument open with a lexical handle; report the OS error on failure.
+open(my $in, '<', $in_file) || die("cannot open $in_file: $!");
+
+my @mapping;
+
+while (my $line = <$in>)
+{
+    next if ($line !~ m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
+    my ($u, $c) = ($1, $2);
+    $c =~ s/ //g;    # join the space-separated bytes into one hex number
+    my $ucs  = hex($u);
+    my $code = hex($c);
+
+    # Skip the single-byte codes 0x80 and 0xFF.
+    next if ($code == 0x0080 || $code == 0x00FF);
+
+    # Keep only entries where both the code and the code point are >= 0x80.
+    if ($code >= 0x80 && $ucs >= 0x0080)
+    {
+        push @mapping, { ucs => $ucs, code => $code, direction => 'both' };
+    }
+}
+close($in);
+
+# One extra character that's not in the source file.
+push @mapping, { direction => 'both', code => 0xa2e8, ucs => 0x327e, comment => 'CIRCLED HANGUL IEUNG U' };
+
+print_tables("UHC", \@mapping);