about summary refs log tree commit diff
path: root/src/backend/utils/mb/Unicode/convutils.pm
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/mb/Unicode/convutils.pm')
-rw-r--r-- src/backend/utils/mb/Unicode/convutils.pm 282
1 files changed, 282 insertions, 0 deletions
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm
new file mode 100644
index 00000000000..d6a13e8c02c
--- /dev/null
+++ b/src/backend/utils/mb/Unicode/convutils.pm
@@ -0,0 +1,282 @@
+#
+# Copyright (c) 2001-2016, PostgreSQL Global Development Group
+#
+# src/backend/utils/mb/Unicode/convutils.pm
+
+use strict;
+
+#######################################################################
+# ucs2utf - convert a UCS-4 code point to UTF-8
+#
+# ucs ; code point, as an integer
+#
+# Returns the UTF-8 byte sequence packed into one integer, first byte
+# in the most significant position (e.g. 0x20ac yields 0xe282ac).
+#
+sub ucs2utf
+{
+    my ($ucs) = @_;
+
+    # One byte: values up to 0x7f are returned unchanged.
+    return $ucs if ($ucs <= 0x007f);
+
+    # Every multi-byte form ends with a continuation byte carrying the
+    # low six bits.
+    my $last = ($ucs & 0x003f) | 0x80;
+
+    # Two bytes: 110xxxxx 10xxxxxx
+    if ($ucs <= 0x07ff)
+    {
+        my $lead = ($ucs >> 6) | 0xc0;
+        return ($lead << 8) | $last;
+    }
+
+    # Bits 6..11 form the second-to-last continuation byte of the three-
+    # and four-byte forms.
+    my $middle = ((($ucs & 0x0fc0) >> 6) | 0x80) << 8;
+
+    # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    if ($ucs <= 0xffff)
+    {
+        my $lead = ($ucs >> 12) | 0xe0;
+        return ($lead << 16) | $middle | $last;
+    }
+
+    # Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    my $lead   = ($ucs >> 18) | 0xf0;
+    my $second = ((($ucs & 0x3ffff) >> 12) | 0x80) << 16;
+    return ($lead << 24) | $second | $middle | $last;
+}
+
+#######################################################################
+# read_source - common routine to read source file
+#
+# fname ; input file name
+#
+# Returns a reference to an array of hashes, one per accepted mapping
+# line, with the fields:
+#   f, l      - source file name and line number
+#   code      - code in the "other" character set (parsed from hex)
+#   ucs       - Unicode code point (parsed from hex)
+#   comment   - trailing "#..." comment of the line
+#   direction - always "both"
+#
+# Lines starting with '#', empty lines, lines that carry a code but no
+# mapping, and mappings where either side is below 0x80 are skipped.
+# Any other line that cannot be parsed is reported on STDERR and the
+# program exits with a non-zero status.  Dies if the file cannot be
+# opened.
+#
+sub read_source
+{
+    my ($fname) = @_;
+    my @r;
+
+    open(my $in, '<', $fname) || die("cannot open $fname: $!");
+
+    while (<$in>)
+    {
+        next if (/^#/);
+
+        # Strip only the line terminator.  A plain chop would eat a real
+        # character when the last line has no trailing newline, and would
+        # leave the CR of a CRLF-terminated line in the comment.
+        s/\r?\n\z//;
+
+        next if (/^$/);    # Ignore empty lines
+
+        # Ignore codes that have no mapping (code followed by a comment only)
+        next if (/^0x([0-9A-F]+)\s+(#.*)$/);
+
+        # Two hex columns (code, UCS), or three for JIS0208.TXT.
+        if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/)
+        {
+            print STDERR "READ ERROR at line $. in $fname: $_\n";
+
+            # Exit with a failure status so the caller notices.
+            exit 1;
+        }
+
+        # With three columns, skip the first one: the mapping is then in
+        # columns two and three.
+        # NOTE(review): assumes every three-column input is laid out like
+        # JIS0208.TXT - confirm against the calling scripts.
+        my ($code, $ucs) = defined($3) ? ($2, $3) : ($1, $2);
+
+        my $out = {f => $fname, l => $.,
+                   code => hex($code),
+                   ucs => hex($ucs),
+                   comment => $4,
+                   direction => "both"
+                  };
+
+        # Ignore pure ASCII mappings. PostgreSQL character conversion code
+        # never even passes these to the conversion code.
+        next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);
+
+        push(@r, $out);
+    }
+    close($in);
+
+    return \@r;
+}
+
+##################################################################
+# print_tables : write all mapping tables for one character set
+#
+# Arguments:
+#  charset - string name of the character set.
+#  table   - mapping table (see format below)
+#  verbose - handed unchanged to the individual table writers:
+#            if 1, output comment on each line,
+#            if 2, also output source file name and number
+#
+# Mapping table format:
+#
+# Mapping table is a list of hashes. Each hash has the following fields:
+#   direction  - Direction: 'both', 'from_unicode' or 'to_unicode'
+#   ucs        - Unicode code point
+#   ucs_second - Second Unicode code point, if this is a "combined" character.
+#   code       - Byte sequence in the "other" character set, as an integer
+#   comment    - Text representation of the character
+#   f          - Source filename
+#   l          - Line number in source file
+#
+# The plain to-UTF8 and from-UTF8 tables are always written; the two
+# "combined" tables are written only when they have at least one entry.
+#
+sub print_tables
+{
+    my ($charset, $table, $verbose) = @_;
+
+    # One list per output file.
+    my (@to_unicode,   @to_unicode_combined);
+    my (@from_unicode, @from_unicode_combined);
+
+    foreach my $map (@$table)
+    {
+        my $combined  = defined $map->{ucs_second};
+        my $direction = $map->{direction};
+
+        # Code points are stored in their UTF-8 form in the output entry.
+        my $entry = {utf8 => ucs2utf($map->{ucs}),
+                     code => $map->{code},
+                     comment => $map->{comment},
+                     f => $map->{f}, l => $map->{l}};
+        $entry->{utf8_second} = ucs2utf($map->{ucs_second}) if ($combined);
+
+        # Pick the pair of lists this entry can go into.
+        my ($to, $from) =
+          $combined
+          ? (\@to_unicode_combined, \@from_unicode_combined)
+          : (\@to_unicode, \@from_unicode);
+
+        push(@$to, $entry)
+          if ($direction eq "both" || $direction eq "to_unicode");
+        push(@$from, $entry)
+          if ($direction eq "both" || $direction eq "from_unicode");
+    }
+
+    print_to_utf8_map($charset, \@to_unicode, $verbose);
+    if (@to_unicode_combined)
+    {
+        print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose);
+    }
+
+    print_from_utf8_map($charset, \@from_unicode, $verbose);
+    if (@from_unicode_combined)
+    {
+        print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose);
+    }
+}
+
+#######################################################################
+# print_from_utf8_map - write the UTF8 => charset mapping table
+#
+# Arguments:
+#  charset - name of the character set; used (lower-cased) in the output
+#            file name and (as given) in the C array name
+#  table   - reference to a list of hashes with the fields utf8, code,
+#            comment, f and l
+#  verbose - if true, output the comment of each entry;
+#            if 2 or more, prefix it with source file name and line number
+#
+# Writes "utf8_to_<charset>.map" in the current directory, entries sorted
+# by their utf8 value, and prints a progress line on stdout.  Dies if the
+# file cannot be opened or if closing it reports an error.
+#
+sub print_from_utf8_map
+{
+    my ($charset, $table, $verbose) = @_;
+
+    my $last_comment = "";
+
+    my $fname = lc("utf8_to_${charset}.map");
+    print "- Writing UTF8=>${charset} conversion table: $fname\n";
+    open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";
+
+    # The names are passed as arguments rather than interpolated into the
+    # format, so a '%' in them cannot be taken as a conversion.
+    printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
+           "static const pg_utf_to_local ULmap%s[ %d ] = {",
+           $fname, $charset, scalar(@$table));
+    my $first = 1;
+    foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table)
+    {
+        # The comment of an entry is written after the comma that follows
+        # the entry, hence at the start of the next iteration.
+        print($out ",") if (!$first);
+        $first = 0;
+        print($out "\t/* $last_comment */") if ($verbose);
+
+        printf($out "\n {0x%04x, 0x%04x}", $$i{utf8}, $$i{code});
+        if ($verbose >= 2)
+        {
+            $last_comment = "$$i{f}:$$i{l} $$i{comment}";
+        }
+        else
+        {
+            $last_comment = $$i{comment};
+        }
+    }
+    print($out "\t/* $last_comment */") if ($verbose);
+    print $out "\n};\n";
+
+    # Buffered write errors (e.g. disk full) are only reported here.
+    close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+#######################################################################
+# print_from_utf8_combined_map - write the UTF8 => charset mapping table
+# for "combined" characters (two code points mapping to one code)
+#
+# Arguments:
+#  charset - name of the character set; used (lower-cased) in the output
+#            file name and (as given) in the C array name
+#  table   - reference to a list of hashes with the fields utf8,
+#            utf8_second, code, comment, f and l
+#  verbose - if true, output the comment of each entry;
+#            if 2 or more, prefix it with source file name and line number
+#
+# Writes "utf8_to_<charset>_combined.map" in the current directory and
+# prints a progress line on stdout.  Dies if the file cannot be opened or
+# if closing it reports an error.
+#
+sub print_from_utf8_combined_map
+{
+    my ($charset, $table, $verbose) = @_;
+
+    my $last_comment = "";
+
+    my $fname = lc("utf8_to_${charset}_combined.map");
+    print "- Writing UTF8=>${charset} conversion table: $fname\n";
+    open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";
+
+    # The names are passed as arguments rather than interpolated into the
+    # format, so a '%' in them cannot be taken as a conversion.
+    printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
+           "static const pg_utf_to_local_combined ULmap%s_combined[ %d ] = {",
+           $fname, $charset, scalar(@$table));
+
+    # Order by the first code point, then by the second, so that entries
+    # sharing their first code point come out in a defined order instead
+    # of depending on the input order.
+    # NOTE(review): presumably the lookup on the C side compares both
+    # values - confirm.
+    my @sorted = sort {
+        $$a{utf8} <=> $$b{utf8}
+          || $$a{utf8_second} <=> $$b{utf8_second}
+    } @$table;
+
+    my $first = 1;
+    foreach my $i (@sorted)
+    {
+        # The comment of an entry is written after the comma that follows
+        # the entry, hence at the start of the next iteration.
+        print($out ",") if (!$first);
+        $first = 0;
+        print($out "\t/* $last_comment */") if ($verbose);
+
+        printf($out "\n {0x%08x, 0x%08x, 0x%04x}", $$i{utf8}, $$i{utf8_second}, $$i{code});
+
+        # Same verbosity levels as the non-combined table writer.
+        if ($verbose >= 2)
+        {
+            $last_comment = "$$i{f}:$$i{l} $$i{comment}";
+        }
+        else
+        {
+            $last_comment = "$$i{comment}";
+        }
+    }
+    print($out "\t/* $last_comment */") if ($verbose);
+    print $out "\n};\n";
+
+    # Buffered write errors (e.g. disk full) are only reported here.
+    close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+#######################################################################
+# print_to_utf8_map - write the charset => UTF8 mapping table
+#
+# Arguments:
+#  charset - name of the character set; used (lower-cased) in the output
+#            file name and (as given) in the C array name
+#  table   - reference to a list of hashes with the fields code, utf8,
+#            comment, f and l
+#  verbose - if true, output the comment of each entry;
+#            if 2 or more, prefix it with source file name and line number
+#
+# Writes "<charset>_to_utf8.map" in the current directory, entries sorted
+# by their code value, and prints a progress line on stdout.  Dies if the
+# file cannot be opened or if closing it reports an error.
+#
+sub print_to_utf8_map
+{
+    my ($charset, $table, $verbose) = @_;
+
+    my $last_comment = "";
+
+    my $fname = lc("${charset}_to_utf8.map");
+
+    print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
+    open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";
+
+    # The names are passed as arguments rather than interpolated into the
+    # format, so a '%' in them cannot be taken as a conversion.
+    printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
+           "static const pg_local_to_utf LUmap%s[ %d ] = {",
+           $fname, $charset, scalar(@$table));
+    my $first = 1;
+    foreach my $i (sort {$$a{code} <=> $$b{code}} @$table)
+    {
+        # The comment of an entry is written after the comma that follows
+        # the entry, hence at the start of the next iteration.
+        print($out ",") if (!$first);
+        $first = 0;
+        print($out "\t/* $last_comment */") if ($verbose);
+
+        printf($out "\n {0x%04x, 0x%x}", $$i{code}, $$i{utf8});
+        if ($verbose >= 2)
+        {
+            $last_comment = "$$i{f}:$$i{l} $$i{comment}";
+        }
+        else
+        {
+            $last_comment = $$i{comment};
+        }
+    }
+    print($out "\t/* $last_comment */") if ($verbose);
+    print $out "\n};\n";
+
+    # Buffered write errors (e.g. disk full) are only reported here.
+    close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+#######################################################################
+# print_to_utf8_combined_map - write the charset => UTF8 mapping table
+# for "combined" characters (one code mapping to two code points)
+#
+# Arguments:
+#  charset - name of the character set; used (lower-cased) in the output
+#            file name and (as given) in the C array name
+#  table   - reference to a list of hashes with the fields code, utf8,
+#            utf8_second, comment, f and l
+#  verbose - if true, output the comment of each entry;
+#            if 2 or more, prefix it with source file name and line number
+#
+# Writes "<charset>_to_utf8_combined.map" in the current directory,
+# entries sorted by their code value, and prints a progress line on
+# stdout.  Dies if the file cannot be opened or if closing it reports an
+# error.
+#
+sub print_to_utf8_combined_map
+{
+    my ($charset, $table, $verbose) = @_;
+
+    my $last_comment = "";
+
+    my $fname = lc("${charset}_to_utf8_combined.map");
+
+    print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
+    open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";
+
+    # The names are passed as arguments rather than interpolated into the
+    # format, so a '%' in them cannot be taken as a conversion.
+    printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
+           "static const pg_local_to_utf_combined LUmap%s_combined[ %d ] = {",
+           $fname, $charset, scalar(@$table));
+    my $first = 1;
+    foreach my $i (sort {$$a{code} <=> $$b{code}} @$table)
+    {
+        # The comment of an entry is written after the comma that follows
+        # the entry, hence at the start of the next iteration.
+        print($out ",") if (!$first);
+        $first = 0;
+        print($out "\t/* $last_comment */") if ($verbose);
+
+        printf($out "\n {0x%04x, 0x%08x, 0x%08x}", $$i{code}, $$i{utf8}, $$i{utf8_second});
+
+        # Same verbosity levels as the non-combined table writer.
+        if ($verbose >= 2)
+        {
+            $last_comment = "$$i{f}:$$i{l} $$i{comment}";
+        }
+        else
+        {
+            $last_comment = "$$i{comment}";
+        }
+    }
+    print($out "\t/* $last_comment */") if ($verbose);
+    print $out "\n};\n";
+
+    # Buffered write errors (e.g. disk full) are only reported here.
+    close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+1;