diff options
Diffstat (limited to 'src/backend/utils/mb/Unicode/convutils.pm')
-rw-r--r-- | src/backend/utils/mb/Unicode/convutils.pm | 806 |
1 files changed, 656 insertions, 150 deletions
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm index eb3c602c32d..6bd84712b05 100644 --- a/src/backend/utils/mb/Unicode/convutils.pm +++ b/src/backend/utils/mb/Unicode/convutils.pm @@ -3,44 +3,27 @@ # # src/backend/utils/mb/Unicode/convutils.pm +package convutils; + use strict; -####################################################################### -# convert UCS-4 to UTF-8 -# -sub ucs2utf -{ - my ($ucs) = @_; - my $utf; +use Exporter 'import'; - if ($ucs <= 0x007f) - { - $utf = $ucs; - } - elsif ($ucs > 0x007f && $ucs <= 0x07ff) - { - $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8); - } - elsif ($ucs > 0x07ff && $ucs <= 0xffff) - { - $utf = - ((($ucs >> 12) | 0xe0) << 16) | - (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); - } - else - { - $utf = - ((($ucs >> 18) | 0xf0) << 24) | - (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) | - (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); - } - return ($utf); -} +our @EXPORT = qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables); + +# Constants used in the 'direction' field of the character maps +use constant { + NONE => 0, + TO_UNICODE => 1, + FROM_UNICODE => 2, + BOTH => 3 +}; ####################################################################### # read_source - common routine to read source file # # fname ; input file name +# sub read_source { my ($fname) = @_; @@ -70,7 +53,9 @@ sub read_source code => hex($1), ucs => hex($2), comment => $4, - direction => "both" + direction => BOTH, + f => $fname, + l => $. }; # Ignore pure ASCII mappings. PostgreSQL character conversion code @@ -85,20 +70,18 @@ sub read_source } ################################################################## -# print_tables : output mapping tables +# print_conversion_tables - output mapping tables # -# Arguments: -# charset - string name of the character set. -# table - mapping table (see format below) -# verbose - if 1, output comment on each line, -# if 2, also output source file name and number +# print_conversion_tables($this_script, $csname, \%charset) # +# this_script - the name of the *caller script* of this feature +# csname - character set name other than ucs +# charset - ref to character set array # +# Input character set array format: # -# Mapping table format: -# -# Mapping table is a list of hashes. Each hash has the following fields: -# direction - Direction: 'both', 'from_unicode' or 'to_unicode' +# Each element in the character set array is a hash. Each hash has the following fields: +# direction - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether) # ucs - Unicode code point # ucs_second - Second Unicode code point, if this is a "combined" character. # code - Byte sequence in the "other" character set, as an integer @@ -106,180 +89,703 @@ sub read_source # f - Source filename # l - Line number in source file # +sub print_conversion_tables +{ + my ($this_script, $csname, $charset) = @_; + + print_conversion_tables_direction($this_script, $csname, FROM_UNICODE, $charset); + print_conversion_tables_direction($this_script, $csname, TO_UNICODE, $charset); +} + +############################################################################# +# INTERNAL ROUTINES + +####################################################################### +# print_conversion_tables_direction - write the whole content of C source of radix tree +# +# print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth) +# +# this_script - the name of the *caller script* of this feature +# csname - character set name other than ucs +# direction - desired direction, TO_UNICODE or FROM_UNICODE +# charset - ref to character set array # -sub print_tables +sub print_conversion_tables_direction { - my ($charset, $table, $verbose) = @_; + my ($this_script, $csname, $direction, $charset) = @_; - # Build an array with only the to-UTF8 direction mappings - my @to_unicode; - my @to_unicode_combined; - my @from_unicode; - my @from_unicode_combined; + my $fname; + my $tblname; + if ($direction == TO_UNICODE) + { + $fname = lc("${csname}_to_utf8.map"); + $tblname = lc("${csname}_to_unicode_tree"); - foreach my $i (@$table) + print "- Writing ${csname}=>UTF8 conversion table: $fname\n"; + } + else { - if (defined $i->{ucs_second}) + $fname = lc("utf8_to_${csname}.map"); + $tblname = lc("${csname}_from_unicode_tree"); + + print "- Writing UTF8=>${csname} conversion table: $fname\n"; + } + + open(my $out, '>', $fname) || die("cannot open $fname"); + + print $out "/* src/backend/utils/mb/Unicode/$fname */\n"; + print $out "/* This file is generated by $this_script */\n\n"; + + # Collect regular, non-combined, mappings, and create the radix tree from them. + my $charmap = &make_charmap($out, $charset, $direction, 0); + print_radix_table($out, $tblname, $charmap); + + # Collect combined characters, and create combined character table (if any) + my $charmap_combined = &make_charmap_combined($charset, $direction); + + if (scalar @{$charmap_combined} > 0) + { + if ($direction == TO_UNICODE) { - my $entry = {utf8 => ucs2utf($i->{ucs}), - utf8_second => ucs2utf($i->{ucs_second}), - code => $i->{code}, - comment => $i->{comment}, - f => $i->{f}, l => $i->{l}}; - if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode") - { - push @to_unicode_combined, $entry; - } - if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode") - { - push @from_unicode_combined, $entry; - } + print_to_utf8_combined_map($out, $csname, + $charmap_combined, 1); } else { - my $entry = {utf8 => ucs2utf($i->{ucs}), - code => $i->{code}, - comment => $i->{comment}, - f => $i->{f}, l => $i->{l}}; - if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode") - { - push @to_unicode, $entry; - } - if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode") - { - push @from_unicode, $entry; - } + print_from_utf8_combined_map($out, $csname, + $charmap_combined, 1); } } - print_to_utf8_map($charset, \@to_unicode, $verbose); - print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose) if (scalar @to_unicode_combined > 0); - print_from_utf8_map($charset, \@from_unicode, $verbose); - print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose) if (scalar @from_unicode_combined > 0); + close($out); } -sub print_from_utf8_map +sub print_from_utf8_combined_map { - my ($charset, $table, $verbose) = @_; + my ($out, $charset, $table, $verbose) = @_; my $last_comment = ""; - my $fname = lc("utf8_to_${charset}.map"); - print "- Writing UTF8=>${charset} conversion table: $fname\n"; - open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n". - "static const pg_utf_to_local ULmap${charset}[ %d ] = {", - scalar(@$table)); + printf $out "\n/* Combined character map */\n"; + printf $out "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", + scalar(@$table); my $first = 1; foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table) { print($out ",") if (!$first); $first = 0; - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); - printf($out "\n {0x%04x, 0x%04x}", $i->{utf8}, $i->{code}); + printf $out "\n {0x%08x, 0x%08x, 0x%04x}", + $i->{utf8}, $i->{utf8_second}, $i->{code}; if ($verbose >= 2) { - $last_comment = "$i->{f}:$i->{l} $i->{comment}"; + $last_comment = + sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); } - else + elsif ($verbose >= 1) { $last_comment = $i->{comment}; } } - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); print $out "\n};\n"; - close($out); } -sub print_from_utf8_combined_map +sub print_to_utf8_combined_map { - my ($charset, $table, $verbose) = @_; + my ($out, $charset, $table, $verbose) = @_; my $last_comment = ""; - my $fname = lc("utf8_to_${charset}_combined.map"); - print "- Writing UTF8=>${charset} conversion table: $fname\n"; - open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n". - "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", - scalar(@$table)); + printf $out "\n/* Combined character map */\n"; + printf $out "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", + scalar(@$table); + my $first = 1; - foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table) + foreach my $i (sort {$a->{code} <=> $b->{code}} @$table) { print($out ",") if (!$first); $first = 0; - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); + + printf $out "\n {0x%04x, 0x%08x, 0x%08x}", + $i->{code}, $i->{utf8}, $i->{utf8_second}; - printf($out "\n {0x%08x, 0x%08x, 0x%04x}", $i->{utf8}, $i->{utf8_second}, $i->{code}); - $last_comment = "$i->{comment}"; + if ($verbose >= 2) + { + $last_comment = + sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); + } + elsif ($verbose >= 1) + { + $last_comment = $i->{comment}; + } } - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); print $out "\n};\n"; - close($out); } -sub print_to_utf8_map +####################################################################### +# print_radix_table(<output handle>, <table name>, <charmap hash ref>) +# +# Input: A hash, mapping an input character to an output character. +# +# Constructs a radix tree from the hash, and prints it out as a C-struct. +# +sub print_radix_table { - my ($charset, $table, $verbose) = @_; - - my $last_comment = ""; + my ($out, $tblname, $c) = @_; + + ### + ### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each + ### radix tree is represented as a nested hash, each hash indexed by + ### input byte + ### + my %b1map; + my %b2map; + my %b3map; + my %b4map; + foreach my $in (keys %$c) + { + my $out = $c->{$in}; - my $fname = lc("${charset}_to_utf8.map"); + if ($in < 0x100) + { + $b1map{$in} = $out; + } + elsif ($in < 0x10000) + { + my $b1 = $in >> 8; + my $b2 = $in & 0xff; - print "- Writing ${charset}=>UTF8 conversion table: $fname\n"; - open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n". - "static const pg_local_to_utf LUmap${charset}[ %d ] = {", - scalar(@$table)); - my $first = 1; - foreach my $i (sort {$a->{code} <=> $b->{code}} @$table) - { - print($out ",") if (!$first); - $first = 0; - print($out "\t/* $last_comment */") if ($verbose); + $b2map{$b1}{$b2} = $out; + } + elsif ($in < 0x1000000) + { + my $b1 = $in >> 16; + my $b2 = ($in >> 8) & 0xff; + my $b3 = $in & 0xff; - printf($out "\n {0x%04x, 0x%x}", $i->{code}, $i->{utf8}); - if ($verbose >= 2) + $b3map{$b1}{$b2}{$b3} = $out; + } + elsif ($in < 0x100000000) { - $last_comment = "$i->{f}:$i->{l} $i->{comment}"; + my $b1 = $in >> 24; + my $b2 = ($in >> 16) & 0xff; + my $b3 = ($in >> 8) & 0xff; + my $b4 = $in & 0xff; + + $b4map{$b1}{$b2}{$b3}{$b4} = $out; } else { - $last_comment = $i->{comment}; + die sprintf("up to 4 byte code is supported: %x", $in); } } - print($out "\t/* $last_comment */") if ($verbose); - print $out "\n};\n"; - close($out); + + my @segments; + + ### + ### Build a linear list of "segments", from the nested hashes. + ### + ### Each segment is a lookup table, keyed by the next byte in the input. + ### The segments are written out physically to one big array in the final + ### step, but logically, they form a radix tree. Or rather, four radix + ### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte + ### inputs, and 4-byte inputs. + ### + ### Each segment is represented by a hash with following fields: + ### + ### comment => <string to output as a comment> + ### label => <label that can be used to refer to this segment from elsewhere> + ### values => <a hash, keyed by byte, 0-0xff> + ### + ### Entries in 'values' can be integers (for leaf-level segments), or + ### string labels, pointing to a segment with that label. Any missing + ### values are treated as zeros. If 'values' hash is missing altogether, + ### it's treated as all-zeros. + ### + ### Subsequent steps will enrich the segments with more fields. + ### + + # Add the segments for the radix trees themselves. + push @segments, build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map); + push @segments, build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map); + push @segments, build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map); + push @segments, build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map); + + ### + ### Find min and max index used in each level of each tree. + ### + ### These are stored separately, and we can then leave out the unused + ### parts of every segment. (When using the resulting tree, you must + ### check each input byte against the min and max.) + ### + my %min_idx; + my %max_idx; + foreach my $seg (@segments) + { + my $this_min = $min_idx{$seg->{depth}}->{$seg->{level}}; + my $this_max = $max_idx{$seg->{depth}}->{$seg->{level}}; + + foreach my $i (keys %{$seg->{values}}) + { + $this_min = $i if (!defined $this_min || $i < $this_min); + $this_max = $i if (!defined $this_max || $i > $this_max); + } + + $min_idx{$seg->{depth}}{$seg->{level}} = $this_min; + $max_idx{$seg->{depth}}{$seg->{level}} = $this_max; + } + # Copy the mins and max's back to every segment, for convenience. + foreach my $seg (@segments) + { + $seg->{min_idx} = $min_idx{$seg->{depth}}{$seg->{level}}; + $seg->{max_idx} = $max_idx{$seg->{depth}}{$seg->{level}}; + } + + ### + ### Prepend a dummy all-zeros map to the beginning. + ### + ### A 0 is an invalid value anywhere in the table, and this allows us to + ### point to 0 offset from any table, to get a 0 result. + ### + + # Find the max range between min and max indexes in any of the segments. + my $widest_range = 0; + foreach my $seg (@segments) + { + my $this_range = $seg->{max_idx} - $seg->{min_idx}; + $widest_range = $this_range if ($this_range > $widest_range); + } + + unshift @segments, { + header => "Dummy map, for invalid values", + min_idx => 0, + max_idx => $widest_range + }; + + ### + ### Eliminate overlapping zeros + ### + ### For each segment, if there are zero values at the end of, and there + ### are also zero values at the beginning of the next segment, we can + ### overlay the tail of this segment with the head of next segment, to + ### save space. + ### + ### To achieve that, we subtract the 'max_idx' of each segment with the + ### amount of zeros that can be overlaid. + ### + for (my $j = 0; $j < $#segments - 1; $j++) + { + my $seg = $segments[$j]; + my $nextseg = $segments[$j + 1]; + + # Count the number of zero values at the end of this segment. + my $this_trail_zeros = 0; + for (my $i = $seg->{max_idx}; $i >= $seg->{min_idx} && !$seg->{values}->{$i}; $i--) + { + $this_trail_zeros++; + } + + # Count the number of zeros at the beginning of next segment. + my $next_lead_zeros = 0; + for (my $i = $nextseg->{min_idx}; $i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i}; $i++) + { + $next_lead_zeros++; + } + + # How many zeros in common? + my $overlaid_trail_zeros = + ($this_trail_zeros > $next_lead_zeros) ? $next_lead_zeros : $this_trail_zeros; + + $seg->{overlaid_trail_zeros} = $overlaid_trail_zeros; + $seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros; + } + + ### + ### Replace label references with real offsets. + ### + ### So far, the non-leaf segments have referred to other segments by + ### their labels. Replace them with numerical offsets from the beginning + ### of the final array. You cannot move, add, or remove segments after + ### this step, as that would invalidate the offsets calculated here! + ### + my $flatoff = 0; + my %segmap; + + # First pass: assign offsets to each segment, and build hash + # of label => offset. + foreach my $seg (@segments) + { + $seg->{offset} = $flatoff; + $segmap{$seg->{label}} = $flatoff; + $flatoff += $seg->{max_idx} - $seg->{min_idx} + 1; + } + my $tblsize = $flatoff; + + # Second pass: look up the offset of each label reference in the hash. + foreach my $seg (@segments) + { + while (my ($i, $val) = each %{$seg->{values}}) + { + if (!($val =~ /^[0-9,.E]+$/ )) + { + my $segoff = $segmap{$val}; + if ($segoff) + { + $seg->{values}->{$i} = $segoff; + } + else + { + die "no segment with label $val"; + } + } + } + } + + # Also look up the positions of the roots in the table. + my $b1root = $segmap{"1-byte"}; + my $b2root = $segmap{"2-byte"}; + my $b3root = $segmap{"3-byte"}; + my $b4root = $segmap{"4-byte"}; + + # And the lower-upper values of each level in each radix tree. + my $b1_lower = $min_idx{1}{1}; + my $b1_upper = $max_idx{1}{1}; + + my $b2_1_lower = $min_idx{2}{1}; + my $b2_1_upper = $max_idx{2}{1}; + my $b2_2_lower = $min_idx{2}{2}; + my $b2_2_upper = $max_idx{2}{2}; + + my $b3_1_lower = $min_idx{3}{1}; + my $b3_1_upper = $max_idx{3}{1}; + my $b3_2_lower = $min_idx{3}{2}; + my $b3_2_upper = $max_idx{3}{2}; + my $b3_3_lower = $min_idx{3}{3}; + my $b3_3_upper = $max_idx{3}{3}; + + my $b4_1_lower = $min_idx{4}{1}; + my $b4_1_upper = $max_idx{4}{1}; + my $b4_2_lower = $min_idx{4}{2}; + my $b4_2_upper = $max_idx{4}{2}; + my $b4_3_lower = $min_idx{4}{3}; + my $b4_3_upper = $max_idx{4}{3}; + my $b4_4_lower = $min_idx{4}{4}; + my $b4_4_upper = $max_idx{4}{4}; + + ### + ### Find the maximum value in the whole table, to determine if we can + ### use uint16 or if we need to use uint32. + ### + my $max_val = 0; + foreach my $seg (@segments) + { + foreach my $val (values %{$seg->{values}}) + { + $max_val = $val if ($val > $max_val); + } + } + + my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32"; + + # For formatting, determine how many values we can fit on a single + # line, and how wide each value needs to be to align nicely. + my $vals_per_line; + my $colwidth; + + if ($max_val <= 0xffff) + { + $vals_per_line = 8; + $colwidth = 4; + } + elsif ($max_val <= 0xffffff) + { + $vals_per_line = 4; + $colwidth = 6; + } + else + { + $vals_per_line = 4; + $colwidth = 8; + } + + ### + ### Print the struct and array. + ### + printf $out "static const $datatype ${tblname}_table[];\n"; + printf $out "\n"; + printf $out "static const pg_mb_radix_tree $tblname =\n"; + printf $out "{\n"; + if ($datatype eq "uint16") + { + print $out " ${tblname}_table,\n"; + print $out " NULL, /* 32-bit table not used */\n"; + } + if ($datatype eq "uint32") + { + print $out " NULL, /* 16-bit table not used */\n"; + print $out " ${tblname}_table,\n"; + } + printf $out "\n"; + printf $out " 0x%04x, /* offset of table for 1-byte inputs */\n", $b1root; + printf $out " 0x%02x, /* b1_lower */\n", $b1_lower; + printf $out " 0x%02x, /* b1_upper */\n", $b1_upper; + printf $out "\n"; + printf $out " 0x%04x, /* offset of table for 2-byte inputs */\n", $b2root; + printf $out " 0x%02x, /* b2_1_lower */\n", $b2_1_lower; + printf $out " 0x%02x, /* b2_1_upper */\n", $b2_1_upper; + printf $out " 0x%02x, /* b2_2_lower */\n", $b2_2_lower; + printf $out " 0x%02x, /* b2_2_upper */\n", $b2_2_upper; + printf $out "\n"; + printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", $b3root; + printf $out " 0x%02x, /* b3_1_lower */\n", $b3_1_lower; + printf $out " 0x%02x, /* b3_1_upper */\n", $b3_1_upper; + printf $out " 0x%02x, /* b3_2_lower */\n", $b3_2_lower; + printf $out " 0x%02x, /* b3_2_upper */\n", $b3_2_upper; + printf $out " 0x%02x, /* b3_3_lower */\n", $b3_3_lower; + printf $out " 0x%02x, /* b3_3_upper */\n", $b3_3_upper; + printf $out "\n"; + printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", $b4root; + printf $out " 0x%02x, /* b4_1_lower */\n", $b4_1_lower; + printf $out " 0x%02x, /* b4_1_upper */\n", $b4_1_upper; + printf $out " 0x%02x, /* b4_2_lower */\n", $b4_2_lower; + printf $out " 0x%02x, /* b4_2_upper */\n", $b4_2_upper; + printf $out " 0x%02x, /* b4_3_lower */\n", $b4_3_lower; + printf $out " 0x%02x, /* b4_3_upper */\n", $b4_3_upper; + printf $out " 0x%02x, /* b4_4_lower */\n", $b4_4_lower; + printf $out " 0x%02x /* b4_4_upper */\n", $b4_4_upper; + print $out "};\n"; + print $out "\n"; + print $out "static const $datatype ${tblname}_table[$tblsize] =\n"; + print $out "{"; + my $off = 0; + foreach my $seg (@segments) + { + printf $out "\n"; + printf $out " /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off; + printf $out "\n"; + + for (my $i=$seg->{min_idx}; $i <= $seg->{max_idx};) + { + # Print the next line's worth of values. + # XXX pad to begin at a nice boundary + printf $out " /* %02x */ ", $i; + for (my $j = 0; $j < $vals_per_line && $i <= $seg->{max_idx}; $j++) + { + my $val = $seg->{values}->{$i}; + + printf $out " 0x%0*x", $colwidth, $val; + $off++; + if ($off != $tblsize) + { + print $out ","; + } + $i++; + } + print $out "\n"; + } + if ($seg->{overlaid_trail_zeros}) + { + printf $out " /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n"; + } + } + + # Sanity check. + if ($off != $tblsize) { die "table size didn't match!"; } + + print $out "};\n"; } -sub print_to_utf8_combined_map +### +sub build_segments_from_tree { - my ($charset, $table, $verbose) = @_; + my ($header, $rootlabel, $depth, $map) = @_; - my $last_comment = ""; + my @segments; - my $fname = lc("${charset}_to_utf8_combined.map"); + if (%{$map}) + { + @segments = build_segments_recurse($header, $rootlabel, "", 1, $depth, $map); - print "- Writing ${charset}=>UTF8 conversion table: $fname\n"; - open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n". - "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", - scalar(@$table)); - my $first = 1; - foreach my $i (sort {$a->{code} <=> $b->{code}} @$table) - { - print($out ",") if (!$first); - $first = 0; - print($out "\t/* $last_comment */") if ($verbose); + # Sort the segments into "breadth-first" order. Not strictly required, + # but makes the maps nicer to read. + @segments = sort { $a->{level} cmp $b->{level} or + $a->{path} cmp $b->{path}} + @segments; + } + + return @segments; +} - printf($out "\n {0x%04x, 0x%08x, 0x%08x}", $i->{code}, $i->{utf8}, $i->{utf8_second}); - $last_comment = "$i->{comment}"; +### +sub build_segments_recurse +{ + my ($header, $label, $path, $level, $depth, $map) = @_; + + my @segments; + + if ($level == $depth) + { + push @segments, { + header => $header . ", leaf: ${path}xx", + label => $label, + level => $level, + depth => $depth, + path => $path, + values => $map + }; } - print($out "\t/* $last_comment */") if ($verbose); - print $out "\n};\n"; - close($out); + else + { + my %children; + + while (my ($i, $val) = each %$map) + { + my $childpath = $path . sprintf("%02x", $i); + my $childlabel = "$depth-level-$level-$childpath"; + + push @segments, build_segments_recurse($header, $childlabel, $childpath, + $level + 1, $depth, $val); + $children{$i} = $childlabel; + } + + push @segments, { + header => $header . ", byte #$level: ${path}xx", + label => $label, + level => $level, + depth => $depth, + path => $path, + values => \%children + }; + } + return @segments; +} + +####################################################################### +# make_charmap - convert charset table to charmap hash +# +# make_charmap(\@charset, $direction) +# charset - ref to charset table : see print_conversion_tables +# direction - conversion direction +# +sub make_charmap +{ + my ($out, $charset, $direction, $verbose) = @_; + + die "unacceptable direction : $direction" + if ($direction != TO_UNICODE && $direction != FROM_UNICODE); + + # In verbose mode, print a large comment with the source and comment of + # each character + if ($verbose) + { + print $out "/*\n"; + print $out "<src> <dst> <file>:<lineno> <comment>\n"; + } + + my %charmap; + foreach my $c (@$charset) + { + # combined characters are handled elsewhere + next if (defined $c->{ucs_second}); + + next if ($c->{direction} != $direction && $c->{direction} != BOTH); + + my ($src, $dst) = + $direction == TO_UNICODE + ? ($c->{code}, ucs2utf($c->{ucs})) + : (ucs2utf($c->{ucs}), $c->{code}); + + # check for duplicate source codes + if (defined $charmap{$src}) + { + printf STDERR + "Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n", + $c->{f}, $c->{l}, $src, $charmap{$src}, $dst; + exit; + } + $charmap{$src} = $dst; + + if ($verbose) + { + printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f}, $c->{l}, $c->{comment}; + } + } + if ($verbose) + { + print $out "*/\n\n"; + } + + return \%charmap; +} + +####################################################################### +# make_charmap_combined - convert charset table to charmap hash +# with checking duplicate source code +# +# make_charmap_combined(\@charset, $direction) +# charset - ref to charset table : see print_conversion_tables +# direction - conversion direction +# +sub make_charmap_combined +{ + my ($charset, $direction) = @_; + + die "unacceptable direction : $direction" + if ($direction != TO_UNICODE && $direction != FROM_UNICODE); + + my @combined; + foreach my $c (@$charset) + { + next if ($c->{direction} != $direction && $c->{direction} != BOTH); + + if (defined $c->{ucs_second}) + { + my $entry = {utf8 => ucs2utf($c->{ucs}), + utf8_second => ucs2utf($c->{ucs_second}), + code => $c->{code}, + comment => $c->{comment}, + f => $c->{f}, l => $c->{l}}; + push @combined, $entry; + } + } + + return \@combined; +} + +####################################################################### +# convert UCS-4 to UTF-8 +# +sub ucs2utf +{ + my ($ucs) = @_; + my $utf; + + if ($ucs <= 0x007f) + { + $utf = $ucs; + } + elsif ($ucs > 0x007f && $ucs <= 0x07ff) + { + $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8); + } + elsif ($ucs > 0x07ff && $ucs <= 0xffff) + { + $utf = + ((($ucs >> 12) | 0xe0) << 16) | + (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); + } + else + { + $utf = + ((($ucs >> 18) | 0xf0) << 24) | + (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) | + (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); + } + return ($utf); } 1; |