diff options
Diffstat (limited to 'src/backend/utils/mb/Unicode/convutils.pm')
-rw-r--r-- | src/backend/utils/mb/Unicode/convutils.pm | 282 |
1 files changed, 282 insertions, 0 deletions
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm new file mode 100644 index 00000000000..d6a13e8c02c --- /dev/null +++ b/src/backend/utils/mb/Unicode/convutils.pm @@ -0,0 +1,282 @@ +# +# Copyright (c) 2001-2016, PostgreSQL Global Development Group +# +# src/backend/utils/mb/Unicode/convutils.pm + +use strict; + +####################################################################### +# convert UCS-4 to UTF-8 +# +sub ucs2utf +{ + my ($ucs) = @_; + my $utf; + + if ($ucs <= 0x007f) + { + $utf = $ucs; + } + elsif ($ucs > 0x007f && $ucs <= 0x07ff) + { + $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8); + } + elsif ($ucs > 0x07ff && $ucs <= 0xffff) + { + $utf = + ((($ucs >> 12) | 0xe0) << 16) | + (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); + } + else + { + $utf = + ((($ucs >> 18) | 0xf0) << 24) | + (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) | + (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); + } + return ($utf); +} + +####################################################################### +# read_source - common routine to read source file +# +# fname ; input file name +sub read_source +{ + my ($fname) = @_; + my @r; + + open(my $in, '<', $fname) || die("cannot open $fname"); + + while (<$in>) + { + next if (/^#/); + chop; + + next if (/^$/); # Ignore empty lines + + next if (/^0x([0-9A-F]+)\s+(#.*)$/); + + # Skip the first column for JIS0208.TXT + if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/) + { + print STDERR "READ ERROR at line $. in $fname: $_\n"; + exit; + } + my $out = {f => $fname, l => $., + code => hex($1), + ucs => hex($2), + comment => $4, + direction => "both" + }; + + # Ignore pure ASCII mappings. PostgreSQL character conversion code + # never even passes these to the conversion code. + next if ($out->{code} < 0x80 || $out->{ucs} < 0x80); + + push(@r, $out); + } + close($in); + + return \@r; +} + +################################################################## +# print_tables : output mapping tables +# +# Arguments: +# charset - string name of the character set. +# table - mapping table (see format below) +# verbose - if 1, output comment on each line, +# if 2, also output source file name and number +# +# +# +# Mapping table format: +# +# Mapping table is a list of hashes. Each hash has the following fields: +# direction - Direction: 'both', 'from_unicode' or 'to_unicode' +# ucs - Unicode code point +# ucs_second - Second Unicode code point, if this is a "combined" character. +# code - Byte sequence in the "other" character set, as an integer +# comment - Text representation of the character +# f - Source filename +# l - Line number in source file +# +# +sub print_tables +{ + my ($charset, $table, $verbose) = @_; + + # Build an array with only the to-UTF8 direction mappings + my @to_unicode; + my @to_unicode_combined; + my @from_unicode; + my @from_unicode_combined; + + foreach my $i (@$table) + { + if (defined $i->{ucs_second}) + { + my $entry = {utf8 => ucs2utf($i->{ucs}), + utf8_second => ucs2utf($i->{ucs_second}), + code => $i->{code}, + comment => $i->{comment}, + f => $i->{f}, l => $i->{l}}; + if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode") + { + push @to_unicode_combined, $entry; + } + if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode") + { + push @from_unicode_combined, $entry; + } + } + else + { + my $entry = {utf8 => ucs2utf($i->{ucs}), + code => $i->{code}, + comment => $i->{comment}, + f => $i->{f}, l => $i->{l}}; + if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode") + { + push @to_unicode, $entry; + } + if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode") + { + push @from_unicode, $entry; + } + } + } + + print_to_utf8_map($charset, \@to_unicode, $verbose); + print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose) if (scalar @to_unicode_combined > 0); + print_from_utf8_map($charset, \@from_unicode, $verbose); + print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose) if (scalar @from_unicode_combined > 0); +} + +sub print_from_utf8_map +{ + my ($charset, $table, $verbose) = @_; + + my $last_comment = ""; + + my $fname = lc("utf8_to_${charset}.map"); + print "- Writing UTF8=>${charset} conversion table: $fname\n"; + open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; + printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n". + "static const pg_utf_to_local ULmap${charset}[ %d ] = {", + scalar(@$table)); + my $first = 1; + foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table) + { + print($out ",") if (!$first); + $first = 0; + print($out "\t/* $last_comment */") if ($verbose); + + printf($out "\n {0x%04x, 0x%04x}", $$i{utf8}, $$i{code}); + if ($verbose >= 2) + { + $last_comment = "$$i{f}:$$i{l} $$i{comment}"; + } + else + { + $last_comment = $$i{comment}; + } + } + print($out "\t/* $last_comment */") if ($verbose); + print $out "\n};\n"; + close($out); +} + +sub print_from_utf8_combined_map +{ + my ($charset, $table, $verbose) = @_; + + my $last_comment = ""; + + my $fname = lc("utf8_to_${charset}_combined.map"); + print "- Writing UTF8=>${charset} conversion table: $fname\n"; + open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; + printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n". + "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", + scalar(@$table)); + my $first = 1; + foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table) + { + print($out ",") if (!$first); + $first = 0; + print($out "\t/* $last_comment */") if ($verbose); + + printf($out "\n {0x%08x, 0x%08x, 0x%04x}", $$i{utf8}, $$i{utf8_second}, $$i{code}); + $last_comment = "$$i{comment}"; + } + print($out "\t/* $last_comment */") if ($verbose); + print $out "\n};\n"; + close($out); +} + +sub print_to_utf8_map +{ + my ($charset, $table, $verbose) = @_; + + my $last_comment = ""; + + my $fname = lc("${charset}_to_utf8.map"); + + print "- Writing ${charset}=>UTF8 conversion table: $fname\n"; + open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; + printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n". + "static const pg_local_to_utf LUmap${charset}[ %d ] = {", + scalar(@$table)); + my $first = 1; + foreach my $i (sort {$$a{code} <=> $$b{code}} @$table) + { + print($out ",") if (!$first); + $first = 0; + print($out "\t/* $last_comment */") if ($verbose); + + printf($out "\n {0x%04x, 0x%x}", $$i{code}, $$i{utf8}); + if ($verbose >= 2) + { + $last_comment = "$$i{f}:$$i{l} $$i{comment}"; + } + else + { + $last_comment = $$i{comment}; + } + } + print($out "\t/* $last_comment */") if ($verbose); + print $out "\n};\n"; + close($out); +} + +sub print_to_utf8_combined_map +{ + my ($charset, $table, $verbose) = @_; + + my $last_comment = ""; + + my $fname = lc("${charset}_to_utf8_combined.map"); + + print "- Writing ${charset}=>UTF8 conversion table: $fname\n"; + open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; + printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n". + "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", + scalar(@$table)); + my $first = 1; + foreach my $i (sort {$$a{code} <=> $$b{code}} @$table) + { + print($out ",") if (!$first); + $first = 0; + print($out "\t/* $last_comment */") if ($verbose); + + printf($out "\n {0x%04x, 0x%08x, 0x%08x}", $$i{code}, $$i{utf8}, $$i{utf8_second}); + $last_comment = "$$i{comment}"; + } + print($out "\t/* $last_comment */") if ($verbose); + print $out "\n};\n"; + close($out); +} + +1; |