1 files changed, 656 insertions, 150 deletions
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm
index eb3c602c32d..6bd84712b05 100644
--- a/src/backend/utils/mb/Unicode/convutils.pm
+++ b/src/backend/utils/mb/Unicode/convutils.pm
@@ -3,44 +3,27 @@
 #
 # src/backend/utils/mb/Unicode/convutils.pm
 
+package convutils;
+
 use strict;
 
-#######################################################################
-# convert UCS-4 to UTF-8
-#
-sub ucs2utf
-{
-	my ($ucs) = @_;
-	my $utf;
+use Exporter 'import';
 
-	if ($ucs <= 0x007f)
-	{
-		$utf = $ucs;
-	}
-	elsif ($ucs > 0x007f && $ucs <= 0x07ff)
-	{
-		$utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
-	}
-	elsif ($ucs > 0x07ff && $ucs <= 0xffff)
-	{
-		$utf =
-		  ((($ucs >> 12) | 0xe0) << 16) |
-		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
-	}
-	else
-	{
-		$utf =
-		  ((($ucs >> 18) | 0xf0) << 24) |
-		  (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
-		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
-	}
-	return ($utf);
-}
+our @EXPORT = qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables);
+
+# Constants used in the 'direction' field of the character maps
+use constant {
+	NONE         => 0,
+	TO_UNICODE   => 1,
+	FROM_UNICODE => 2,
+	BOTH         => 3
+};
 
 #######################################################################
 # read_source - common routine to read source file
 #
 # fname ; input file name
+#
 sub read_source
 {
 	my ($fname) = @_;
@@ -70,7 +53,9 @@ sub read_source
 				   code => hex($1),
 				   ucs => hex($2),
 				   comment => $4,
-				   direction => "both"
+				   direction => BOTH,
+				   f => $fname,
+				   l => $.
 				};
 
 		# Ignore pure ASCII mappings. PostgreSQL character conversion code
@@ -85,20 +70,18 @@ sub read_source
 }
 
 ##################################################################
-# print_tables : output mapping tables
+# print_conversion_tables - output mapping tables
 #
-# Arguments:
-#  charset - string name of the character set.
-#  table   - mapping table (see format below)
-#  verbose - if 1, output comment on each line,
-#            if 2, also output source file name and number
+# print_conversion_tables($this_script, $csname, \%charset)
 #
+# this_script - the name of the *caller script* of this feature
+# csname      - character set name other than ucs
+# charset     - ref to character set array
 #
+# Input character set array format:
 #
-# Mapping table format:
-#
-# Mapping table is a list of hashes. Each hash has the following fields:
-#   direction  - Direction: 'both', 'from_unicode' or 'to_unicode'
+# Each element in the character set array is a hash. Each hash has the following fields:
+#   direction  - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether)
 #   ucs        - Unicode code point
 #   ucs_second - Second Unicode code point, if this is a "combined" character.
 #   code       - Byte sequence in the "other" character set, as an integer
@@ -106,180 +89,703 @@ sub read_source
 #   f          - Source filename
 #   l          - Line number in source file
 #
+sub print_conversion_tables
+{
+	my ($this_script, $csname, $charset) = @_;
+
+	print_conversion_tables_direction($this_script, $csname, FROM_UNICODE, $charset);
+	print_conversion_tables_direction($this_script, $csname, TO_UNICODE,   $charset);
+}
+
+#############################################################################
+# INTERNAL ROUTINES
+
+#######################################################################
+# print_conversion_tables_direction - write the whole content of C source of radix tree
+#
+# print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth)
+#
+# this_script - the name of the *caller script* of this feature
+# csname      - character set name other than ucs
+# direction   - desired direction, TO_UNICODE or FROM_UNICODE
+# charset     - ref to character set array
 #
-sub print_tables
+sub print_conversion_tables_direction
 {
-	my ($charset, $table, $verbose) = @_;
+	my ($this_script, $csname, $direction, $charset) = @_;
 
-	# Build an array with only the to-UTF8 direction mappings
-	my @to_unicode;
-	my @to_unicode_combined;
-	my @from_unicode;
-	my @from_unicode_combined;
+	my $fname;
+	my $tblname;
+	if ($direction == TO_UNICODE)
+	{
+		$fname = lc("${csname}_to_utf8.map");
+		$tblname = lc("${csname}_to_unicode_tree");
 
-	foreach my $i (@$table)
+		print "- Writing ${csname}=>UTF8 conversion table: $fname\n";
+	}
+	else
 	{
-		if (defined $i->{ucs_second})
+		$fname = lc("utf8_to_${csname}.map");
+		$tblname = lc("${csname}_from_unicode_tree");
+
+		print "- Writing UTF8=>${csname} conversion table: $fname\n";
+	}
+
+	open(my $out, '>', $fname) || die("cannot open $fname");
+
+	print $out "/* src/backend/utils/mb/Unicode/$fname */\n";
+	print $out "/* This file is generated by $this_script */\n\n";
+
+	# Collect regular, non-combined, mappings, and create the radix tree from them.
+	my $charmap = &make_charmap($out, $charset, $direction, 0);
+	print_radix_table($out, $tblname, $charmap);
+
+	# Collect combined characters, and create combined character table (if any)
+	my $charmap_combined = &make_charmap_combined($charset, $direction);
+
+	if (scalar @{$charmap_combined} > 0)
+	{
+		if ($direction == TO_UNICODE)
 		{
-			my $entry = {utf8 => ucs2utf($i->{ucs}),
-						 utf8_second => ucs2utf($i->{ucs_second}),
-						 code => $i->{code},
-						 comment => $i->{comment},
-						 f => $i->{f}, l => $i->{l}};
-			if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode")
-			{
-				push @to_unicode_combined, $entry;
-			}
-			if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode")
-			{
-				push @from_unicode_combined, $entry;
-			}
+			print_to_utf8_combined_map($out, $csname,
+									   $charmap_combined, 1);
 		}
 		else
 		{
-			my $entry = {utf8 => ucs2utf($i->{ucs}),
-						 code => $i->{code},
-						 comment => $i->{comment},
-						 f => $i->{f}, l => $i->{l}};
-			if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode")
-			{
-				push @to_unicode, $entry;
-			}
-			if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode")
-			{
-				push @from_unicode, $entry;
-			}
+			print_from_utf8_combined_map($out, $csname,
+										 $charmap_combined, 1);
 		}
 	}
 
-	print_to_utf8_map($charset, \@to_unicode, $verbose);
-	print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose) if (scalar @to_unicode_combined > 0);
-	print_from_utf8_map($charset, \@from_unicode, $verbose);
-	print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose) if (scalar @from_unicode_combined > 0);
+	close($out);
 }
 
-sub print_from_utf8_map
+sub print_from_utf8_combined_map
 {
-	my ($charset, $table, $verbose) = @_;
+	my ($out, $charset, $table, $verbose) = @_;
 
 	my $last_comment = "";
 
-	my $fname = lc("utf8_to_${charset}.map");
-	print "- Writing UTF8=>${charset} conversion table: $fname\n";
-	open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
-	printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n".
-		   "static const pg_utf_to_local ULmap${charset}[ %d ] = {",
-		   scalar(@$table));
+	printf $out "\n/* Combined character map */\n";
+	printf $out "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
+	  scalar(@$table);
 	my $first = 1;
 	foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table)
     {
 		print($out ",") if (!$first);
 		$first = 0;
-		print($out "\t/* $last_comment */") if ($verbose);
+		print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
 
-		printf($out "\n  {0x%04x, 0x%04x}", $i->{utf8}, $i->{code});
+		printf $out "\n  {0x%08x, 0x%08x, 0x%04x}",
+		  $i->{utf8}, $i->{utf8_second}, $i->{code};
 		if ($verbose >= 2)
 		{
-			$last_comment = "$i->{f}:$i->{l} $i->{comment}";
+			$last_comment =
+			  sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
 		}
-		else
+		elsif ($verbose >= 1)
 		{
 			$last_comment = $i->{comment};
 		}
 	}
-	print($out "\t/* $last_comment */") if ($verbose);
+	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
 	print $out "\n};\n";
-	close($out);
 }
 
-sub print_from_utf8_combined_map
+sub print_to_utf8_combined_map
 {
-	my ($charset, $table, $verbose) = @_;
+	my ($out, $charset, $table, $verbose) = @_;
 
 	my $last_comment = "";
 
-	my $fname = lc("utf8_to_${charset}_combined.map");
-	print "- Writing UTF8=>${charset} conversion table: $fname\n";
-	open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
-	printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n".
-		   "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
-		   scalar(@$table));
+	printf $out "\n/* Combined character map */\n";
+	printf $out "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
+	  scalar(@$table);
+
 	my $first = 1;
-	foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table)
+	foreach my $i (sort {$a->{code} <=> $b->{code}} @$table)
     {
 		print($out ",") if (!$first);
 		$first = 0;
-		print($out "\t/* $last_comment */") if ($verbose);
+		print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
+
+		printf $out "\n  {0x%04x, 0x%08x, 0x%08x}",
+		  $i->{code}, $i->{utf8}, $i->{utf8_second};
 
-		printf($out "\n  {0x%08x, 0x%08x, 0x%04x}", $i->{utf8}, $i->{utf8_second}, $i->{code});
-		$last_comment = "$i->{comment}";
+		if ($verbose >= 2)
+		{
+			$last_comment =
+				sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
+		}
+		elsif ($verbose >= 1)
+		{
+			$last_comment = $i->{comment};
+		}
 	}
-	print($out "\t/* $last_comment */") if ($verbose);
+	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
 	print $out "\n};\n";
-	close($out);
 }
 
-sub print_to_utf8_map
+#######################################################################
+# print_radix_table(<output handle>, <table name>, <charmap hash ref>)
+#
+# Input: A hash, mapping an input character to an output character.
+#
+# Constructs a radix tree from the hash, and prints it out as a C-struct.
+#
+sub print_radix_table
 {
-	my ($charset, $table, $verbose) = @_;
-
-	my $last_comment = "";
+	my ($out, $tblname, $c) = @_;
+
+	###
+	### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each
+	### radix tree is represented as a nested hash, each hash indexed by
+	### input byte
+	###
+	my %b1map;
+	my %b2map;
+	my %b3map;
+	my %b4map;
+	foreach my $in (keys %$c)
+	{
+		my $out = $c->{$in};
 
-	my $fname = lc("${charset}_to_utf8.map");
+		if ($in < 0x100)
+		{
+			$b1map{$in} = $out;
+		}
+		elsif ($in < 0x10000)
+		{
+			my $b1     = $in >> 8;
+			my $b2     = $in & 0xff;
 
-	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
-	open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
-	printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n".
-		   "static const pg_local_to_utf LUmap${charset}[ %d ] = {",
-		   scalar(@$table));
-	my $first = 1;
-	foreach my $i (sort {$a->{code} <=> $b->{code}} @$table)
-    {
-		print($out ",") if (!$first);
-		$first = 0;
-		print($out "\t/* $last_comment */") if ($verbose);
+			$b2map{$b1}{$b2} = $out;
+		}
+		elsif ($in < 0x1000000)
+		{
+			my $b1     = $in >> 16;
+			my $b2     = ($in >> 8) & 0xff;
+			my $b3     = $in & 0xff;
 
-		printf($out "\n  {0x%04x, 0x%x}", $i->{code}, $i->{utf8});
-		if ($verbose >= 2)
+			$b3map{$b1}{$b2}{$b3} = $out;
+		}
+		elsif ($in < 0x100000000)
 		{
-			$last_comment = "$i->{f}:$i->{l} $i->{comment}";
+			my $b1     = $in >> 24;
+			my $b2     = ($in >> 16) & 0xff;
+			my $b3     = ($in >> 8) & 0xff;
+			my $b4     = $in & 0xff;
+
+			$b4map{$b1}{$b2}{$b3}{$b4} = $out;
 		}
 		else
 		{
-			$last_comment = $i->{comment};
+			die sprintf("up to 4 byte code is supported: %x", $in);
 		}
 	}
-	print($out "\t/* $last_comment */") if ($verbose);
-	print $out "\n};\n";
-	close($out);
+
+	my @segments;
+
+	###
+	### Build a linear list of "segments", from the nested hashes.
+	###
+	### Each segment is a lookup table, keyed by the next byte in the input.
+	### The segments are written out physically to one big array in the final
+	### step, but logically, they form a radix tree. Or rather, four radix
+	### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte
+	### inputs, and 4-byte inputs.
+	###
+	### Each segment is represented by a hash with following fields:
+	###
+	### comment => <string to output as a comment>
+	### label => <label that can be used to refer to this segment from elsewhere>
+	### values => <a hash, keyed by byte, 0-0xff>
+	###
+	### Entries in 'values' can be integers (for leaf-level segments), or
+	### string labels, pointing to a segment with that label. Any missing
+	### values are treated as zeros. If 'values' hash is missing altogether,
+	### it's treated as all-zeros.
+	###
+	### Subsequent steps will enrich the segments with more fields.
+	###
+
+	# Add the segments for the radix trees themselves.
+	push @segments, build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map);
+	push @segments, build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map);
+	push @segments, build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map);
+	push @segments, build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map);
+
+	###
+	### Find min and max index used in each level of each tree.
+	###
+	### These are stored separately, and we can then leave out the unused
+	### parts of every segment. (When using the resulting tree, you must
+	### check each input byte against the min and max.)
+	###
+	my %min_idx;
+	my %max_idx;
+	foreach my $seg (@segments)
+	{
+		my $this_min = $min_idx{$seg->{depth}}->{$seg->{level}};
+		my $this_max = $max_idx{$seg->{depth}}->{$seg->{level}};
+
+		foreach my $i (keys %{$seg->{values}})
+		{
+			$this_min = $i if (!defined $this_min || $i < $this_min);
+			$this_max = $i if (!defined $this_max || $i > $this_max);
+		}
+
+		$min_idx{$seg->{depth}}{$seg->{level}} = $this_min;
+		$max_idx{$seg->{depth}}{$seg->{level}} = $this_max;
+	}
+	# Copy the mins and max's back to every segment, for convenience.
+	foreach my $seg (@segments)
+	{
+		$seg->{min_idx} = $min_idx{$seg->{depth}}{$seg->{level}};
+		$seg->{max_idx} = $max_idx{$seg->{depth}}{$seg->{level}};
+	}
+
+	###
+	### Prepend a dummy all-zeros map to the beginning.
+	###
+	### A 0 is an invalid value anywhere in the table, and this allows us to
+	### point to 0 offset from any table, to get a 0 result.
+	###
+
+	# Find the max range between min and max indexes in any of the segments.
+	my $widest_range = 0;
+	foreach my $seg (@segments)
+	{
+		my $this_range = $seg->{max_idx} - $seg->{min_idx};
+		$widest_range = $this_range if ($this_range > $widest_range);
+	}
+
+	unshift @segments, {
+		header => "Dummy map, for invalid values",
+		min_idx => 0,
+		max_idx => $widest_range
+	};
+
+	###
+	### Eliminate overlapping zeros
+	###
+	### For each segment, if there are zero values at the end of, and there
+	### are also zero values at the beginning of the next segment, we can
+	### overlay the tail of this segment with the head of next segment, to
+	### save space.
+	###
+	### To achieve that, we subtract the 'max_idx' of each segment with the
+	### amount of zeros that can be overlaid.
+	###
+	for (my $j = 0; $j < $#segments - 1; $j++)
+	{
+		my $seg = $segments[$j];
+		my $nextseg = $segments[$j + 1];
+
+		# Count the number of zero values at the end of this segment.
+		my $this_trail_zeros = 0;
+		for (my $i = $seg->{max_idx}; $i >= $seg->{min_idx} && !$seg->{values}->{$i}; $i--)
+		{
+			$this_trail_zeros++;
+		}
+
+		# Count the number of zeros at the beginning of next segment.
+		my $next_lead_zeros = 0;
+		for (my $i = $nextseg->{min_idx}; $i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i}; $i++)
+		{
+			$next_lead_zeros++;
+		}
+
+		# How many zeros in common?
+		my $overlaid_trail_zeros =
+			($this_trail_zeros > $next_lead_zeros) ? $next_lead_zeros : $this_trail_zeros;
+
+		$seg->{overlaid_trail_zeros} = $overlaid_trail_zeros;
+		$seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros;
+	}
+
+	###
+	### Replace label references with real offsets.
+	###
+	### So far, the non-leaf segments have referred to other segments by
+	### their labels. Replace them with numerical offsets from the beginning
+	### of the final array. You cannot move, add, or remove segments after
+	### this step, as that would invalidate the offsets calculated here!
+	###
+	my $flatoff = 0;
+	my %segmap;
+
+	# First pass: assign offsets to each segment, and build hash
+	# of label => offset.
+	foreach my $seg (@segments)
+	{
+		$seg->{offset} = $flatoff;
+		$segmap{$seg->{label}} = $flatoff;
+		$flatoff += $seg->{max_idx} - $seg->{min_idx} + 1;
+	}
+	my $tblsize = $flatoff;
+
+	# Second pass: look up the offset of each label reference in the hash.
+	foreach my $seg (@segments)
+	{
+		while (my ($i, $val) = each %{$seg->{values}})
+		{
+			if (!($val =~ /^[0-9,.E]+$/ ))
+			{
+				my $segoff = $segmap{$val};
+				if ($segoff)
+				{
+					$seg->{values}->{$i} = $segoff;
+				}
+				else
+				{
+					die "no segment with label $val";
+				}
+			}
+		}
+	}
+
+	# Also look up the positions of the roots in the table.
+	my $b1root = $segmap{"1-byte"};
+	my $b2root = $segmap{"2-byte"};
+	my $b3root = $segmap{"3-byte"};
+	my $b4root = $segmap{"4-byte"};
+
+	# And the lower-upper values of each level in each radix tree.
+	my $b1_lower = $min_idx{1}{1};
+	my $b1_upper = $max_idx{1}{1};
+
+	my $b2_1_lower = $min_idx{2}{1};
+	my $b2_1_upper = $max_idx{2}{1};
+	my $b2_2_lower = $min_idx{2}{2};
+	my $b2_2_upper = $max_idx{2}{2};
+
+	my $b3_1_lower = $min_idx{3}{1};
+	my $b3_1_upper = $max_idx{3}{1};
+	my $b3_2_lower = $min_idx{3}{2};
+	my $b3_2_upper = $max_idx{3}{2};
+	my $b3_3_lower = $min_idx{3}{3};
+	my $b3_3_upper = $max_idx{3}{3};
+
+	my $b4_1_lower = $min_idx{4}{1};
+	my $b4_1_upper = $max_idx{4}{1};
+	my $b4_2_lower = $min_idx{4}{2};
+	my $b4_2_upper = $max_idx{4}{2};
+	my $b4_3_lower = $min_idx{4}{3};
+	my $b4_3_upper = $max_idx{4}{3};
+	my $b4_4_lower = $min_idx{4}{4};
+	my $b4_4_upper = $max_idx{4}{4};
+
+	###
+	### Find the maximum value in the whole table, to determine if we can
+	### use uint16 or if we need to use uint32.
+	###
+	my $max_val = 0;
+	foreach my $seg (@segments)
+	{
+		foreach my $val (values %{$seg->{values}})
+		{
+			$max_val = $val if ($val > $max_val);
+		}
+	}
+
+	my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32";
+
+	# For formatting, determine how many values we can fit on a single
+	# line, and how wide each value needs to be to align nicely.
+	my $vals_per_line;
+	my $colwidth;
+
+	if ($max_val <= 0xffff)
+	{
+		$vals_per_line = 8;
+		$colwidth = 4;
+	}
+	elsif ($max_val <= 0xffffff)
+	{
+		$vals_per_line = 4;
+		$colwidth = 6;
+	}
+	else
+	{
+		$vals_per_line = 4;
+		$colwidth = 8;
+	}
+
+	###
+	### Print the struct and array.
+	###
+	printf $out "static const $datatype ${tblname}_table[];\n";
+	printf $out "\n";
+	printf $out "static const pg_mb_radix_tree $tblname =\n";
+	printf $out "{\n";
+	if ($datatype eq "uint16")
+	{
+		print $out "  ${tblname}_table,\n";
+		print $out "  NULL, /* 32-bit table not used */\n";
+	}
+	if ($datatype eq "uint32")
+	{
+		print $out "  NULL, /* 16-bit table not used */\n";
+		print $out "  ${tblname}_table,\n";
+	}
+	printf $out "\n";
+	printf $out "  0x%04x, /* offset of table for 1-byte inputs */\n", $b1root;
+	printf $out "  0x%02x, /* b1_lower */\n", $b1_lower;
+	printf $out "  0x%02x, /* b1_upper */\n", $b1_upper;
+	printf $out "\n";
+	printf $out "  0x%04x, /* offset of table for 2-byte inputs */\n", $b2root;
+	printf $out "  0x%02x, /* b2_1_lower */\n", $b2_1_lower;
+	printf $out "  0x%02x, /* b2_1_upper */\n", $b2_1_upper;
+	printf $out "  0x%02x, /* b2_2_lower */\n", $b2_2_lower;
+	printf $out "  0x%02x, /* b2_2_upper */\n", $b2_2_upper;
+	printf $out "\n";
+	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n", $b3root;
+	printf $out "  0x%02x, /* b3_1_lower */\n", $b3_1_lower;
+	printf $out "  0x%02x, /* b3_1_upper */\n", $b3_1_upper;
+	printf $out "  0x%02x, /* b3_2_lower */\n", $b3_2_lower;
+	printf $out "  0x%02x, /* b3_2_upper */\n", $b3_2_upper;
+	printf $out "  0x%02x, /* b3_3_lower */\n", $b3_3_lower;
+	printf $out "  0x%02x, /* b3_3_upper */\n", $b3_3_upper;
+	printf $out "\n";
+	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n", $b4root;
+	printf $out "  0x%02x, /* b4_1_lower */\n", $b4_1_lower;
+	printf $out "  0x%02x, /* b4_1_upper */\n", $b4_1_upper;
+	printf $out "  0x%02x, /* b4_2_lower */\n", $b4_2_lower;
+	printf $out "  0x%02x, /* b4_2_upper */\n", $b4_2_upper;
+	printf $out "  0x%02x, /* b4_3_lower */\n", $b4_3_lower;
+	printf $out "  0x%02x, /* b4_3_upper */\n", $b4_3_upper;
+	printf $out "  0x%02x, /* b4_4_lower */\n", $b4_4_lower;
+	printf $out "  0x%02x  /* b4_4_upper */\n", $b4_4_upper;
+	print $out "};\n";
+	print $out "\n";
+	print $out "static const $datatype ${tblname}_table[$tblsize] =\n";
+	print $out "{";
+	my $off = 0;
+	foreach my $seg (@segments)
+	{
+		printf $out "\n";
+		printf $out "  /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off;
+		printf $out "\n";
+
+		for (my $i=$seg->{min_idx}; $i <= $seg->{max_idx};)
+		{
+			# Print the next line's worth of values.
+			# XXX pad to begin at a nice boundary
+			printf $out "  /* %02x */ ", $i;
+			for (my $j = 0; $j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
+			{
+				my $val = $seg->{values}->{$i};
+
+				printf $out " 0x%0*x", $colwidth, $val;
+				$off++;
+				if ($off != $tblsize)
+				{
+					print $out ",";
+				}
+				$i++;
+			}
+			print $out "\n";
+		}
+		if ($seg->{overlaid_trail_zeros})
+		{
+			printf $out "    /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n";
+		}
+	}
+
+	# Sanity check.
+	if ($off != $tblsize) { die "table size didn't match!"; }
+
+	print $out "};\n";
 }
 
-sub print_to_utf8_combined_map
+###
+sub build_segments_from_tree
 {
-	my ($charset, $table, $verbose) = @_;
+	my ($header, $rootlabel, $depth, $map) = @_;
 
-	my $last_comment = "";
+	my @segments;
 
-	my $fname = lc("${charset}_to_utf8_combined.map");
+	if (%{$map})
+	{
+		@segments = build_segments_recurse($header, $rootlabel, "", 1, $depth, $map);
 
-	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
-	open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
-	printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n".
-		   "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
-		   scalar(@$table));
-	my $first = 1;
-	foreach my $i (sort {$a->{code} <=> $b->{code}} @$table)
-    {
-		print($out ",") if (!$first);
-		$first = 0;
-		print($out "\t/* $last_comment */") if ($verbose);
+		# Sort the segments into "breadth-first" order. Not strictly required,
+		# but makes the maps nicer to read.
+		@segments = sort { $a->{level} cmp $b->{level} or
+						   $a->{path}  cmp $b->{path}}
+						 @segments;
+	}
+
+	return @segments;
+}
 
-		printf($out "\n  {0x%04x, 0x%08x, 0x%08x}", $i->{code}, $i->{utf8}, $i->{utf8_second});
-		$last_comment = "$i->{comment}";
+###
+sub build_segments_recurse
+{
+	my ($header, $label, $path, $level, $depth, $map) = @_;
+
+	my @segments;
+
+	if ($level == $depth)
+	{
+		push @segments, {
+			header => $header . ", leaf: ${path}xx",
+			label => $label,
+			level => $level,
+			depth => $depth,
+			path => $path,
+			values => $map
+		};
 	}
-	print($out "\t/* $last_comment */") if ($verbose);
-	print $out "\n};\n";
-	close($out);
+	else
+	{
+		my %children;
+
+		while (my ($i, $val) = each %$map)
+		{
+			my $childpath = $path . sprintf("%02x", $i);
+			my $childlabel = "$depth-level-$level-$childpath";
+
+			push @segments, build_segments_recurse($header, $childlabel, $childpath,
+												   $level + 1, $depth, $val);
+			$children{$i} = $childlabel;
+		}
+
+		push @segments, {
+			header => $header . ", byte #$level: ${path}xx",
+			label => $label,
+			level => $level,
+			depth => $depth,
+			path => $path,
+			values => \%children
+		};
+	}
+	return @segments;
+}
+
+#######################################################################
+# make_charmap - convert charset table to charmap hash
+#
+# make_charmap(\@charset, $direction)
+# charset     - ref to charset table : see print_conversion_tables
+# direction   - conversion direction
+#
+sub make_charmap
+{
+	my ($out, $charset, $direction, $verbose) = @_;
+
+	die "unacceptable direction : $direction"
+	  if ($direction != TO_UNICODE && $direction != FROM_UNICODE);
+
+	# In verbose mode, print a large comment with the source and comment of
+	# each character
+	if ($verbose)
+	{
+		print $out "/*\n";
+		print $out "<src>  <dst>    <file>:<lineno> <comment>\n";
+	}
+
+	my %charmap;
+	foreach my $c (@$charset)
+	{
+		# combined characters are handled elsewhere
+		next if (defined $c->{ucs_second});
+
+		next if ($c->{direction} != $direction && $c->{direction} != BOTH);
+
+		my ($src, $dst) =
+			$direction == TO_UNICODE
+			? ($c->{code}, ucs2utf($c->{ucs}))
+			: (ucs2utf($c->{ucs}), $c->{code});
+
+		# check for duplicate source codes
+		if (defined $charmap{$src})
+		{
+			printf STDERR
+				"Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n",
+				$c->{f}, $c->{l}, $src, $charmap{$src}, $dst;
+			exit;
+		}
+		$charmap{$src} = $dst;
+
+		if ($verbose)
+		{
+			printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f}, $c->{l}, $c->{comment};
+		}
+	}
+	if ($verbose)
+	{
+		print $out "*/\n\n";
+	}
+
+	return \%charmap;
+}
+
+#######################################################################
+# make_charmap_combined - convert charset table to charmap hash
+#     with checking duplicate source code
+#
+# make_charmap_combined(\@charset, $direction)
+# charset     - ref to charset table : see print_conversion_tables
+# direction   - conversion direction
+#
+sub make_charmap_combined
+{
+	my ($charset, $direction) = @_;
+
+	die "unacceptable direction : $direction"
+	  if ($direction != TO_UNICODE && $direction != FROM_UNICODE);
+
+	my @combined;
+	foreach my $c (@$charset)
+	{
+		next if ($c->{direction} != $direction && $c->{direction} != BOTH);
+
+		if (defined $c->{ucs_second})
+		{
+			my $entry = {utf8 => ucs2utf($c->{ucs}),
+						 utf8_second => ucs2utf($c->{ucs_second}),
+						 code => $c->{code},
+						 comment => $c->{comment},
+						 f => $c->{f}, l => $c->{l}};
+			push @combined, $entry;
+		}
+	}
+
+	return \@combined;
+}
+
+#######################################################################
+# convert UCS-4 to UTF-8
+#
+sub ucs2utf
+{
+	my ($ucs) = @_;
+	my $utf;
+
+	if ($ucs <= 0x007f)
+	{
+		$utf = $ucs;
+	}
+	elsif ($ucs > 0x007f && $ucs <= 0x07ff)
+	{
+		$utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
+	}
+	elsif ($ucs > 0x07ff && $ucs <= 0xffff)
+	{
+		$utf =
+		  ((($ucs >> 12) | 0xe0) << 16) |
+		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
+	}
+	else
+	{
+		$utf =
+		  ((($ucs >> 18) | 0xf0) << 24) |
+		  (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
+		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
+	}
+	return ($utf);
 }
 
 1;