diff options
Diffstat (limited to 'src/common/unicode/generate-unicode_case_table.pl')
-rw-r--r-- | src/common/unicode/generate-unicode_case_table.pl | 388 |
1 files changed, 343 insertions, 45 deletions
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl index 953ebef2fe6..3c2edc7ca4d 100644 --- a/src/common/unicode/generate-unicode_case_table.pl +++ b/src/common/unicode/generate-unicode_case_table.pl @@ -231,26 +231,24 @@ while (my $line = <$FH>) close $FH; # assign sequential array indexes to the special mappings -my $special_idx = 0; +# 0 is reserved for NULL +my $special_idx = 1; foreach my $code (sort { $a <=> $b } (keys %special)) { $special{$code}{Index} = $special_idx++; } +# determine size of array +my $num_special = scalar(keys %special) + 1; + +die + "special case map contains $num_special entries which cannot be represented in uint8" + if ($num_special > 256); + # Start writing out the output files open my $OT, '>', $output_table_file or die "Could not open output file $output_table_file: $!\n"; -# determine size of array given that codepoints <= 0x80 are dense and -# the rest of the entries are sparse -my $num_simple = 0x80; -foreach my $code (sort { $a <=> $b } (keys %simple)) -{ - $num_simple++ unless $code < 0x80; -} - -my $num_special = scalar(keys %special) + 1; - print $OT <<"EOS"; /*------------------------------------------------------------------------- * @@ -298,24 +296,17 @@ typedef enum typedef struct { - pg_wchar codepoint; /* Unicode codepoint */ int16 conditions; pg_wchar map[NCaseKind][MAX_CASE_EXPANSION]; } pg_special_case; -typedef struct -{ - pg_wchar codepoint; /* Unicode codepoint */ - pg_wchar simplemap[NCaseKind]; - const pg_special_case *special_case; -} pg_case_map; - /* * Special case mappings that aren't representable in the simple map. * Entries are referenced from simple_case_map. */ static const pg_special_case special_case[$num_special] = { + {0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}, EOS foreach my $code (sort { $a <=> $b } (keys %special)) @@ -332,23 +323,46 @@ foreach my $code (sort { $a <=> $b } (keys %special)) (map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} }); my $fold = join ", ", (map { sprintf "0x%06x", $_ } @{ $special{$code}{Foldcase} }); - printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions}; - printf $OT "{{%s}, {%s}, {%s}, {%s}}},\n", $lower, $title, $upper, $fold; + printf $OT "\t{%s, ", $special{$code}{Conditions}; + printf $OT + "{[CaseLower] = {%s},[CaseTitle] = {%s},[CaseUpper] = {%s},[CaseFold] = {%s}}},\n", + $lower, $title, $upper, $fold; } +print $OT "};\n"; -print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n"; -print $OT <<"EOS"; -}; - -/* - * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup), - * sparse for higher codepoints (requiring scan or binary search). - */ -static const pg_case_map case_map[$num_simple] = +# Separate maps for each case form, starting with the reserved entry +# at index 0. The first element is the result code point, and the +# second element is the input code point (which is not ultimately +# stored in the C array, it's just there as a comment). +my %map = ( + lower => [ [ 0, -1 ] ], + title => [ [ 0, -1 ] ], + upper => [ [ 0, -1 ] ], + fold => [ [ 0, -1 ] ], + special => [ [ 0, -1 ] ]); + + +# Current index into the map arrays above. +my $index = 1; + +# Sets of case forms/variations. Simple case pairs have the same set +# of case forms, e.g. the letters 'a' and 'A' both lowercase to 'a'; +# both uppercase to 'A', etc. By tracking unique sets using a hash, we +# cut the size needed for the maps in half (some characters are +# exceptions, so it's not exactly half). The key is an array of all +# case forms, and the value is an index into the maps. +my %case_forms; + +# Perl doesn't allow arrays as hash keys, so we need to transform the +# set of case forms to a scalar. +sub get_hash_key { -EOS + return join ",", @_; +} -printf $OT "\t/* begin dense entries for codepoints < 0x80 */\n"; +# Create map entries for all codepoints < 0x80, so that the caller can +# have a fast-path lookup without needing to go through the main +# table. for (my $code = 0; $code < 0x80; $code++) { my $lc = ($simple{$code}{Simple_Lowercase} || $code); @@ -358,26 +372,310 @@ for (my $code = 0; $code < 0x80; $code++) die "unexpected special case for code $code" if defined $special{$code}; - printf $OT - "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, NULL},\n", - $code, $lc, $tc, $uc, $fc; + + push @{ $map{lower} }, [ $lc, $code ]; + push @{ $map{title} }, [ $tc, $code ]; + push @{ $map{upper} }, [ $uc, $code ]; + push @{ $map{fold} }, [ $fc, $code ]; + push @{ $map{special} }, [ 0, $code ]; + + my $key = get_hash_key($lc, $tc, $uc, $fc, 0); + + $simple{$code}{Index} = $index; + $case_forms{$key} = $index++; } -printf $OT "\n"; -printf $OT "\t/* begin sparse entries for codepoints >= 0x80 */\n"; +# Create map entries for all characters >= 0x80 that have case +# mappings (any character with a special case mapping also has an +# entry in %simple). foreach my $code (sort { $a <=> $b } (keys %simple)) { next unless $code >= 0x80; # already output above - my $map = $simple{$code}; - my $special_case = "NULL"; + my $entry = $simple{$code}; + my $special_case = 0; if (exists $special{$code}) { - $special_case = sprintf "&special_case[%d]", $special{$code}{Index}; + $special_case = $special{$code}{Index}; } - printf $OT - "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, %s},\n", - $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase}, - $map->{Simple_Uppercase}, $map->{Simple_Foldcase}, $special_case; + + my $key = get_hash_key( + $entry->{Simple_Lowercase}, $entry->{Simple_Titlecase}, + $entry->{Simple_Uppercase}, $entry->{Simple_Foldcase}, + $special_case); + + unless (exists $case_forms{$key}) + { + $case_forms{$key} = $index++; + + push @{ $map{lower} }, [ $entry->{Simple_Lowercase}, $code ]; + push @{ $map{title} }, [ $entry->{Simple_Titlecase}, $code ]; + push @{ $map{upper} }, [ $entry->{Simple_Uppercase}, $code ]; + push @{ $map{fold} }, [ $entry->{Simple_Foldcase}, $code ]; + push @{ $map{special} }, [ $special_case, $code ]; + } + + $simple{$code}{Index} = $case_forms{$key}; +} + +die + "mapping tables contains $index entries which cannot be represented in uint16" + if ($index > 65536); + +foreach my $kind ('lower', 'title', 'upper', 'fold') +{ + print $OT <<"EOS"; + +/* + * The entry case_map_${kind}[case_index(codepoint)] is the mapping for the + * given codepoint. + */ +static const pg_wchar case_map_$kind\[$index\] = +{ +EOS + + foreach my $entry (@{ $map{$kind} }) + { + my $comment = + @$entry[1] == -1 ? "reserved" : sprintf("U+%06x", @$entry[1]); + print $OT + sprintf("\t0x%06x,\t\t\t\t\t/* %s */\n", @$entry[0], $comment); + } + + print $OT "\n};\n"; +} + +print $OT <<"EOS"; + +/* + * The entry case_map_special[case_index(codepoint)] is the index in + * special_case for that codepoint, or 0 if no special case mapping exists. + */ +static const uint8 case_map_special\[$index\] = +{ +EOS + +foreach my $entry (@{ $map{special} }) +{ + my $s = sprintf("%d,", @$entry[0]); + $s .= "\t" if length($s) < 4; + my $comment = + @$entry[1] == -1 ? "reserved" : sprintf("U+%06x", @$entry[1]); + print $OT sprintf("\t%s\t\t\t\t\t\t/* %s */\n", $s, $comment); +} + +print $OT "\n};\n"; + +print $OT <<"EOS"; + +/* + * Map for each case kind. + */ +static const pg_wchar *casekind_map[NCaseKind] = +{ + [CaseLower] = case_map_lower, + [CaseTitle] = case_map_title, + [CaseUpper] = case_map_upper, + [CaseFold] = case_map_fold, +}; +EOS + +my @codepoints = keys %simple; +my $range = make_ranges(\@codepoints, 500); +my @case_map_lines = range_tables($range); +my $case_map_length = scalar @case_map_lines; +my $case_map_table = join "\n", @case_map_lines; + +print $OT <<"EOS"; + +/* + * Used by case_index() to map a codepoint to an index that can be used in any + * of the following arrays: case_map_lower, case_map_title, case_map_upper, + * case_map_fold. + */ +static const uint16 case_map[$case_map_length] = +{ +$case_map_table +}; + + +EOS + +# First range is the fast path. It must start at codepoint zero, and +# the end is the fastpath limit. Track the limit here and then +# remove it before generating the other branches. +die "first range must start at 0" unless ${ @$range[0] }{Start} == 0; +my $fastpath_limit = sprintf("0x%04X", ${ @$range[0] }{End}); +shift @$range; + +print $OT <<"EOS"; +/* + * case_index() + * + * Given a code point, compute the index in the case_map at which we can find + * the offset into the mapping tables. + */ +static inline uint16 +case_index(pg_wchar cp) +{ + /* Fast path for codepoints < $fastpath_limit */ + if (cp < $fastpath_limit) + { + return case_map[cp]; + } + +EOS + +print $OT join("\n", @{ branch($range, 0, $#$range, 1) }); + +print $OT <<"EOS"; + + + return 0; +} +EOS + +close $OT; + +# The function generates C code with a series of nested if-else conditions +# to search for the matching interval. +sub branch +{ + my ($range, $from, $to, $indent) = @_; + my ($idx, $space, $entry, $table, @result); + + $idx = ($from + int(($to - $from) / 2)); + return \@result unless exists $range->[$idx]; + + $space = "\t" x $indent; + + $entry = $range->[$idx]; + + # IF state + if ($idx == $from) + { + if ($idx == 0) + { + push @result, + sprintf("%sif (cp >= 0x%04X && cp < 0x%04X)\n%s{", + $space, $entry->{Start}, $entry->{End}, $space); + } + else + { + push @result, + sprintf("%sif (cp < 0x%04X)\n%s{", + $space, $entry->{End}, $space); + } + + push @result, + sprintf("%s\treturn case_map[cp - 0x%04X + %d];", + $space, $entry->{Start}, $entry->{Offset}); + } + else + { + push @result, + sprintf("%sif (cp < 0x%04X)\n%s{", $space, $entry->{End}, $space); + push @result, @{ branch($range, $from, $idx - 1, $indent + 1) }; + } + + push @result, $space . "}"; + + # return now if it's the last range + return \@result if $idx == (scalar @$range) - 1; + + # ELSE looks ahead to the next range to avoid adding an + # unnecessary level of branching. + $entry = @$range[ $idx + 1 ]; + + # ELSE state + push @result, + sprintf("%selse if (cp >= 0x%04X)\n%s{", + $space, $entry->{Start}, $space); + + if ($idx == $to) + { + push @result, + sprintf("%s\treturn case_map\[cp - 0x%04X + %d];", + $space, $entry->{Start}, $entry->{Offset}); + } + else + { + push @result, @{ branch($range, $idx + 1, $to, $indent + 1) }; + } + + push @result, $space . "}"; + + return \@result; +} + +# Group numbers into ranges where the difference between neighboring +# elements does not exceed $limit. If the difference is greater, a new +# range is created. This is used to break the sequence into intervals +# where the gaps between numbers are greater than limit. +# +# For example, if there are numbers 1, 2, 3, 5, 6 and limit = 1, then +# there is a difference of 2 between 3 and 5, which is greater than 1, +# so there will be ranges 1-3 and 5-6. +sub make_ranges +{ + my ($nums, $limit) = @_; + my ($prev, $start, $curr, $total, @sorted, @range); + + @sorted = sort { $a <=> $b } @$nums; + + die "expecting at least 2 codepoints" if (scalar @sorted < 2); + + $start = shift @sorted; + + die "expecting first codepoint to start at 0" unless $start == 0; + + $prev = $start; + $total = 0; + + # append final 'undef' to signal final iteration + push @sorted, undef; + + foreach $curr (@sorted) + { + # if last iteration always append the range + if (!defined($curr) || ($curr - $prev > $limit)) + { + push @range, + { + Start => $start, + End => $prev + 1, + Offset => $total + }; + $total += $prev + 1 - $start; + $start = $curr; + } + + $prev = $curr; + } + + return \@range; +} + +# The function combines all ranges into the case_map table. Ranges may +# include codepoints without a case mapping at all, in which case the +# entry in case_map should be zero. +sub range_tables +{ + my ($range) = @_; + my (@lines, @result); + + foreach my $entry (@$range) + { + my $start = $entry->{Start}; + my $end = $entry->{End} - 1; + + foreach my $cp ($start .. $end) + { + my $idx = sprintf("%d,", ($simple{$cp}{Index} || 0)); + $idx .= "\t" if length($idx) < 4; + push @lines, sprintf("\t%s\t\t\t\t\t\t/* U+%06X */", $idx, $cp); + } + } + + return @lines; } -print $OT "};\n"; |