aboutsummaryrefslogtreecommitdiff
path: root/src/common/unicode/generate-unicode_case_table.pl
diff options
context:
space:
mode:
Diffstat (limited to 'src/common/unicode/generate-unicode_case_table.pl')
-rw-r--r--src/common/unicode/generate-unicode_case_table.pl388
1 files changed, 343 insertions, 45 deletions
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl
index 953ebef2fe6..3c2edc7ca4d 100644
--- a/src/common/unicode/generate-unicode_case_table.pl
+++ b/src/common/unicode/generate-unicode_case_table.pl
@@ -231,26 +231,24 @@ while (my $line = <$FH>)
close $FH;
# assign sequential array indexes to the special mappings
-my $special_idx = 0;
+# 0 is reserved for NULL
+my $special_idx = 1;
foreach my $code (sort { $a <=> $b } (keys %special))
{
$special{$code}{Index} = $special_idx++;
}
+# determine size of array
+my $num_special = scalar(keys %special) + 1;
+
+die
+ "special case map contains $num_special entries which cannot be represented in uint8"
+ if ($num_special > 256);
+
# Start writing out the output files
open my $OT, '>', $output_table_file
or die "Could not open output file $output_table_file: $!\n";
-# determine size of array given that codepoints <= 0x80 are dense and
-# the rest of the entries are sparse
-my $num_simple = 0x80;
-foreach my $code (sort { $a <=> $b } (keys %simple))
-{
- $num_simple++ unless $code < 0x80;
-}
-
-my $num_special = scalar(keys %special) + 1;
-
print $OT <<"EOS";
/*-------------------------------------------------------------------------
*
@@ -298,24 +296,17 @@ typedef enum
typedef struct
{
- pg_wchar codepoint; /* Unicode codepoint */
int16 conditions;
pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
} pg_special_case;
-typedef struct
-{
- pg_wchar codepoint; /* Unicode codepoint */
- pg_wchar simplemap[NCaseKind];
- const pg_special_case *special_case;
-} pg_case_map;
-
/*
* Special case mappings that aren't representable in the simple map.
* Entries are referenced from simple_case_map.
*/
static const pg_special_case special_case[$num_special] =
{
+ {0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}},
EOS
foreach my $code (sort { $a <=> $b } (keys %special))
@@ -332,23 +323,46 @@ foreach my $code (sort { $a <=> $b } (keys %special))
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
my $fold = join ", ",
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Foldcase} });
- printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
- printf $OT "{{%s}, {%s}, {%s}, {%s}}},\n", $lower, $title, $upper, $fold;
+ printf $OT "\t{%s, ", $special{$code}{Conditions};
+ printf $OT
+ "{[CaseLower] = {%s},[CaseTitle] = {%s},[CaseUpper] = {%s},[CaseFold] = {%s}}},\n",
+ $lower, $title, $upper, $fold;
}
+print $OT "};\n";
-print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
-print $OT <<"EOS";
-};
-
-/*
- * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
- * sparse for higher codepoints (requiring scan or binary search).
- */
-static const pg_case_map case_map[$num_simple] =
+# Separate maps for each case form, starting with the reserved entry
+# at index 0. The first element is the result code point, and the
+# second element is the input code point (which is not ultimately
+# stored in the C array, it's just there as a comment).
+my %map = (
+ lower => [ [ 0, -1 ] ],
+ title => [ [ 0, -1 ] ],
+ upper => [ [ 0, -1 ] ],
+ fold => [ [ 0, -1 ] ],
+ special => [ [ 0, -1 ] ]);
+
+
+# Current index into the map arrays above.
+my $index = 1;
+
+# Sets of case forms/variations. Simple case pairs have the same set
+# of case forms, e.g. the letters 'a' and 'A' both lowercase to 'a';
+# both uppercase to 'A', etc. By tracking unique sets using a hash, we
+# cut the size needed for the maps in half (some characters are
+# exceptions, so it's not exactly half). The key is an array of all
+# case forms, and the value is an index into the maps.
+my %case_forms;
+
+# Perl doesn't allow arrays as hash keys, so we need to transform the
+# set of case forms to a scalar.
+sub get_hash_key
{
-EOS
+ return join ",", @_;
+}
-printf $OT "\t/* begin dense entries for codepoints < 0x80 */\n";
+# Create map entries for all codepoints < 0x80, so that the caller can
+# have a fast-path lookup without needing to go through the main
+# table.
for (my $code = 0; $code < 0x80; $code++)
{
my $lc = ($simple{$code}{Simple_Lowercase} || $code);
@@ -358,26 +372,310 @@ for (my $code = 0; $code < 0x80; $code++)
die "unexpected special case for code $code"
if defined $special{$code};
- printf $OT
- "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, NULL},\n",
- $code, $lc, $tc, $uc, $fc;
+
+ push @{ $map{lower} }, [ $lc, $code ];
+ push @{ $map{title} }, [ $tc, $code ];
+ push @{ $map{upper} }, [ $uc, $code ];
+ push @{ $map{fold} }, [ $fc, $code ];
+ push @{ $map{special} }, [ 0, $code ];
+
+ my $key = get_hash_key($lc, $tc, $uc, $fc, 0);
+
+ $simple{$code}{Index} = $index;
+ $case_forms{$key} = $index++;
}
-printf $OT "\n";
-printf $OT "\t/* begin sparse entries for codepoints >= 0x80 */\n";
+# Create map entries for all characters >= 0x80 that have case
+# mappings (any character with a special case mapping also has an
+# entry in %simple).
foreach my $code (sort { $a <=> $b } (keys %simple))
{
next unless $code >= 0x80; # already output above
- my $map = $simple{$code};
- my $special_case = "NULL";
+ my $entry = $simple{$code};
+ my $special_case = 0;
if (exists $special{$code})
{
- $special_case = sprintf "&special_case[%d]", $special{$code}{Index};
+ $special_case = $special{$code}{Index};
}
- printf $OT
- "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, %s},\n",
- $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
- $map->{Simple_Uppercase}, $map->{Simple_Foldcase}, $special_case;
+
+ my $key = get_hash_key(
+ $entry->{Simple_Lowercase}, $entry->{Simple_Titlecase},
+ $entry->{Simple_Uppercase}, $entry->{Simple_Foldcase},
+ $special_case);
+
+ unless (exists $case_forms{$key})
+ {
+ $case_forms{$key} = $index++;
+
+ push @{ $map{lower} }, [ $entry->{Simple_Lowercase}, $code ];
+ push @{ $map{title} }, [ $entry->{Simple_Titlecase}, $code ];
+ push @{ $map{upper} }, [ $entry->{Simple_Uppercase}, $code ];
+ push @{ $map{fold} }, [ $entry->{Simple_Foldcase}, $code ];
+ push @{ $map{special} }, [ $special_case, $code ];
+ }
+
+ $simple{$code}{Index} = $case_forms{$key};
+}
+
+die
+ "mapping tables contains $index entries which cannot be represented in uint16"
+ if ($index > 65536);
+
+foreach my $kind ('lower', 'title', 'upper', 'fold')
+{
+ print $OT <<"EOS";
+
+/*
+ * The entry case_map_${kind}[case_index(codepoint)] is the mapping for the
+ * given codepoint.
+ */
+static const pg_wchar case_map_$kind\[$index\] =
+{
+EOS
+
+ foreach my $entry (@{ $map{$kind} })
+ {
+ my $comment =
+ @$entry[1] == -1 ? "reserved" : sprintf("U+%06x", @$entry[1]);
+ print $OT
+ sprintf("\t0x%06x,\t\t\t\t\t/* %s */\n", @$entry[0], $comment);
+ }
+
+ print $OT "\n};\n";
+}
+
+print $OT <<"EOS";
+
+/*
+ * The entry case_map_special[case_index(codepoint)] is the index in
+ * special_case for that codepoint, or 0 if no special case mapping exists.
+ */
+static const uint8 case_map_special\[$index\] =
+{
+EOS
+
+foreach my $entry (@{ $map{special} })
+{
+ my $s = sprintf("%d,", @$entry[0]);
+ $s .= "\t" if length($s) < 4;
+ my $comment =
+ @$entry[1] == -1 ? "reserved" : sprintf("U+%06x", @$entry[1]);
+ print $OT sprintf("\t%s\t\t\t\t\t\t/* %s */\n", $s, $comment);
+}
+
+print $OT "\n};\n";
+
+print $OT <<"EOS";
+
+/*
+ * Map for each case kind.
+ */
+static const pg_wchar *casekind_map[NCaseKind] =
+{
+ [CaseLower] = case_map_lower,
+ [CaseTitle] = case_map_title,
+ [CaseUpper] = case_map_upper,
+ [CaseFold] = case_map_fold,
+};
+EOS
+
+my @codepoints = keys %simple;
+my $range = make_ranges(\@codepoints, 500);
+my @case_map_lines = range_tables($range);
+my $case_map_length = scalar @case_map_lines;
+my $case_map_table = join "\n", @case_map_lines;
+
+print $OT <<"EOS";
+
+/*
+ * Used by case_index() to map a codepoint to an index that can be used in any
+ * of the following arrays: case_map_lower, case_map_title, case_map_upper,
+ * case_map_fold.
+ */
+static const uint16 case_map[$case_map_length] =
+{
+$case_map_table
+};
+
+
+EOS
+
+# First range is the fast path. It must start at codepoint zero, and
+# the end is the fastpath limit. Track the limit here and then
+# remove it before generating the other branches.
+die "first range must start at 0" unless ${ @$range[0] }{Start} == 0;
+my $fastpath_limit = sprintf("0x%04X", ${ @$range[0] }{End});
+shift @$range;
+
+print $OT <<"EOS";
+/*
+ * case_index()
+ *
+ * Given a code point, compute the index in the case_map at which we can find
+ * the offset into the mapping tables.
+ */
+static inline uint16
+case_index(pg_wchar cp)
+{
+ /* Fast path for codepoints < $fastpath_limit */
+ if (cp < $fastpath_limit)
+ {
+ return case_map[cp];
+ }
+
+EOS
+
+print $OT join("\n", @{ branch($range, 0, $#$range, 1) });
+
+print $OT <<"EOS";
+
+
+ return 0;
+}
+EOS
+
+close $OT;
+
+# The function generates C code with a series of nested if-else conditions
+# to search for the matching interval.
+sub branch
+{
+ my ($range, $from, $to, $indent) = @_;
+ my ($idx, $space, $entry, $table, @result);
+
+ $idx = ($from + int(($to - $from) / 2));
+ return \@result unless exists $range->[$idx];
+
+ $space = "\t" x $indent;
+
+ $entry = $range->[$idx];
+
+ # IF state
+ if ($idx == $from)
+ {
+ if ($idx == 0)
+ {
+ push @result,
+ sprintf("%sif (cp >= 0x%04X && cp < 0x%04X)\n%s{",
+ $space, $entry->{Start}, $entry->{End}, $space);
+ }
+ else
+ {
+ push @result,
+ sprintf("%sif (cp < 0x%04X)\n%s{",
+ $space, $entry->{End}, $space);
+ }
+
+ push @result,
+ sprintf("%s\treturn case_map[cp - 0x%04X + %d];",
+ $space, $entry->{Start}, $entry->{Offset});
+ }
+ else
+ {
+ push @result,
+ sprintf("%sif (cp < 0x%04X)\n%s{", $space, $entry->{End}, $space);
+ push @result, @{ branch($range, $from, $idx - 1, $indent + 1) };
+ }
+
+ push @result, $space . "}";
+
+ # return now if it's the last range
+ return \@result if $idx == (scalar @$range) - 1;
+
+ # ELSE looks ahead to the next range to avoid adding an
+ # unnecessary level of branching.
+ $entry = @$range[ $idx + 1 ];
+
+ # ELSE state
+ push @result,
+ sprintf("%selse if (cp >= 0x%04X)\n%s{",
+ $space, $entry->{Start}, $space);
+
+ if ($idx == $to)
+ {
+ push @result,
+ sprintf("%s\treturn case_map\[cp - 0x%04X + %d];",
+ $space, $entry->{Start}, $entry->{Offset});
+ }
+ else
+ {
+ push @result, @{ branch($range, $idx + 1, $to, $indent + 1) };
+ }
+
+ push @result, $space . "}";
+
+ return \@result;
+}
+
+# Group numbers into ranges where the difference between neighboring
+# elements does not exceed $limit. If the difference is greater, a new
+# range is created. This is used to break the sequence into intervals
+# where the gaps between numbers are greater than limit.
+#
+# For example, if there are numbers 1, 2, 3, 5, 6 and limit = 1, then
+# there is a difference of 2 between 3 and 5, which is greater than 1,
+# so there will be ranges 1-3 and 5-6.
+sub make_ranges
+{
+ my ($nums, $limit) = @_;
+ my ($prev, $start, $curr, $total, @sorted, @range);
+
+ @sorted = sort { $a <=> $b } @$nums;
+
+ die "expecting at least 2 codepoints" if (scalar @sorted < 2);
+
+ $start = shift @sorted;
+
+ die "expecting first codepoint to start at 0" unless $start == 0;
+
+ $prev = $start;
+ $total = 0;
+
+ # append final 'undef' to signal final iteration
+ push @sorted, undef;
+
+ foreach $curr (@sorted)
+ {
+ # if last iteration always append the range
+ if (!defined($curr) || ($curr - $prev > $limit))
+ {
+ push @range,
+ {
+ Start => $start,
+ End => $prev + 1,
+ Offset => $total
+ };
+ $total += $prev + 1 - $start;
+ $start = $curr;
+ }
+
+ $prev = $curr;
+ }
+
+ return \@range;
+}
+
+# The function combines all ranges into the case_map table. Ranges may
+# include codepoints without a case mapping at all, in which case the
+# entry in case_map should be zero.
+sub range_tables
+{
+ my ($range) = @_;
+ my (@lines, @result);
+
+ foreach my $entry (@$range)
+ {
+ my $start = $entry->{Start};
+ my $end = $entry->{End} - 1;
+
+ foreach my $cp ($start .. $end)
+ {
+ my $idx = sprintf("%d,", ($simple{$cp}{Index} || 0));
+ $idx .= "\t" if length($idx) < 4;
+ push @lines, sprintf("\t%s\t\t\t\t\t\t/* U+%06X */", $idx, $cp);
+ }
+ }
+
+ return @lines;
}
-print $OT "};\n";