diff options
Diffstat (limited to 'src/common')
-rw-r--r-- | src/common/unicode/Makefile | 4 | ||||
-rw-r--r-- | src/common/unicode/generate-unicode_norm_table.pl | 226 | ||||
-rw-r--r-- | src/common/unicode_norm.c | 106 |
3 files changed, 293 insertions, 43 deletions
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile index 93a9d1615f1..eb14add28ad 100644 --- a/src/common/unicode/Makefile +++ b/src/common/unicode/Makefile @@ -18,7 +18,7 @@ LIBS += $(PTHREAD_LIBS) # By default, do nothing. all: -update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h +update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h mv $^ ../../../src/include/common/ $(MAKE) normalization-check @@ -30,6 +30,8 @@ UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt Normaliz # Generation of conversion tables used for string normalization with # UTF-8 strings. +unicode_norm_hashfunc.h: unicode_norm_table.h + unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt $(PERL) generate-unicode_norm_table.pl diff --git a/src/common/unicode/generate-unicode_norm_table.pl b/src/common/unicode/generate-unicode_norm_table.pl index 7ce15e1a039..e4d3ccc2346 100644 --- a/src/common/unicode/generate-unicode_norm_table.pl +++ b/src/common/unicode/generate-unicode_norm_table.pl @@ -1,16 +1,22 @@ #!/usr/bin/perl # -# Generate a composition table, using Unicode data files as input +# Generate a composition table and its lookup utilities, using Unicode data +# files as input. # # Input: UnicodeData.txt and CompositionExclusions.txt -# Output: unicode_norm_table.h +# Output: unicode_norm_table.h and unicode_norm_hashfunc.h # # Copyright (c) 2000-2020, PostgreSQL Global Development Group use strict; use warnings; -my $output_file = "unicode_norm_table.h"; +use FindBin; +use lib "$FindBin::RealBin/../../tools/"; +use PerfectHash; + +my $output_table_file = "unicode_norm_table.h"; +my $output_func_file = "unicode_norm_hashfunc.h"; my $FH; @@ -64,11 +70,13 @@ close $FH; my $num_characters = scalar @characters; -# Start writing out the output file -open my $OUTPUT, '>', $output_file - or die "Could not open output file $output_file: $!\n"; +# Start writing out the output files +open my $OT, '>', $output_table_file + or die "Could not open output file $output_table_file: $!\n"; +open my $OF, '>', $output_func_file + or die "Could not open output file $output_func_file: $!\n"; -print $OUTPUT <<HEADER; +print $OT <<HEADER; /*------------------------------------------------------------------------- * * unicode_norm_table.h @@ -111,8 +119,53 @@ static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] = { HEADER +print $OF <<HEADER; +/*------------------------------------------------------------------------- + * + * unicode_norm_hashfunc.h + * Perfect hash functions used for Unicode normalization + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/common/unicode_norm_hashfunc.h + * + *------------------------------------------------------------------------- + */ + +/* + * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl, + * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H + * here. + */ + +#include "common/unicode_norm_table.h" + +/* Typedef for perfect hash functions */ +typedef int (*cp_hash_func) (const void *key); + +/* Information for lookups with perfect hash functions */ +typedef struct +{ + const pg_unicode_decomposition *decomps; + cp_hash_func hash; + int num_decomps; +} pg_unicode_decompinfo; + +typedef struct +{ + const uint16 *inverse_lookup; + cp_hash_func hash; + int num_recomps; +} pg_unicode_recompinfo; + +HEADER + my $decomp_index = 0; my $decomp_string = ""; +my @dec_cp_packed; +my $main_index = 0; +my @rec_info; my $last_code = $characters[-1]->{code}; foreach my $char (@characters) @@ -121,6 +174,9 @@ foreach my $char (@characters) my $class = $char->{class}; my $decomp = $char->{decomp}; + # Save the code point bytes as a string in network order. + push @dec_cp_packed, pack('N', hex($char->{code})); + # The character decomposition mapping field in UnicodeData.txt is a list # of unicode codepoints, separated by space. But it can be prefixed with # so-called compatibility formatting tag, like "<compat>", or "<font>". @@ -163,7 +219,7 @@ foreach my $char (@characters) { foreach my $lcode (@composition_exclusion_codes) { - if ($lcode eq $char->{code}) + if ($lcode eq $code) { $flags .= " | DECOMP_NO_COMPOSE"; $comment = "in exclusion list"; @@ -171,11 +227,26 @@ foreach my $char (@characters) } } } + + # Save info for recomposeable codepoints. + # Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C + # above! See also the inverse lookup in recompose_code() found in + # src/common/unicode_norm.c. + if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/)) + { + push @rec_info, + { + code => $code, + main_index => $main_index, + first => $first_decomp, + second => $decomp_elts[0] + }; + } } if ($decomp_size == 0) { - print $OUTPUT "\t{0x$code, $class, 0$flags, 0}"; + print $OT "\t{0x$code, $class, 0$flags, 0}"; } elsif ($decomp_size == 1 && length($first_decomp) <= 4) { @@ -183,12 +254,11 @@ foreach my $char (@characters) # The decomposition consists of a single codepoint, and it fits # in a uint16, so we can store it "inline" in the main table. $flags .= " | DECOMP_INLINE"; - print $OUTPUT "\t{0x$code, $class, 1$flags, 0x$first_decomp}"; + print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}"; } else { - print $OUTPUT - "\t{0x$code, $class, $decomp_size$flags, $decomp_index}"; + print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}"; # Now save the decompositions into a dedicated area that will # be written afterwards. First build the entry dedicated to @@ -205,25 +275,17 @@ foreach my $char (@characters) } # Print a comma after all items except the last one. - print $OUTPUT "," unless ($code eq $last_code); - if ($comment ne "") - { + print $OT "," unless ($code eq $last_code); - # If the line is wide already, indent the comment with one tab, - # otherwise with two. This is to make the output match the way - # pgindent would mangle it. (This is quite hacky. To do this - # properly, we should actually track how long the line is so far, - # but this works for now.) - print $OUTPUT "\t" if ($decomp_index < 10); + print $OT "\t/* $comment */" if ($comment ne ""); + print $OT "\n"; - print $OUTPUT "\t/* $comment */" if ($comment ne ""); - } - print $OUTPUT "\n"; + $main_index++; } -print $OUTPUT "\n};\n\n"; +print $OT "\n};\n\n"; # Print the array of decomposed codes. -print $OUTPUT <<HEADER; +print $OT <<HEADER; /* codepoints array */ static const uint32 UnicodeDecomp_codepoints[$decomp_index] = { @@ -231,4 +293,114 @@ $decomp_string }; HEADER -close $OUTPUT; +# Emit the definition of the decomp hash function. +my $dec_funcname = 'Decomp_hash_func'; +my $dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed, + $dec_funcname, fixed_key_length => 4); +print $OF "/* Perfect hash function for decomposition */\n"; +print $OF "static $dec_func\n"; + +# Emit the structure that wraps the hash lookup information into +# one variable. +print $OF <<HEADER; +/* Hash lookup information for decomposition */ +static const pg_unicode_decompinfo UnicodeDecompInfo = +{ + UnicodeDecompMain, + $dec_funcname, + $num_characters +}; + +HEADER + +# Find the lowest codepoint that decomposes to each recomposeable +# code pair and create a mapping to it. +my $recomp_string = ""; +my @rec_cp_packed; +my %seenit; +my $firstentry = 1; +foreach my $rec (sort recomp_sort @rec_info) +{ + # The hash key is formed by concatenating the bytes of the two + # codepoints. See also recompose_code() in common/unicode_norm.c. + my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second}); + + # We are only interested in the lowest code point that decomposes + # to the given code pair. + next if $seenit{$hashkey}; + + # Save the hash key bytes in network order + push @rec_cp_packed, pack('Q>', $hashkey); + + # Append inverse lookup element + $recomp_string .= ",\n" if !$firstentry; + $recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s", + $rec->{first}, + $rec->{second}, + $rec->{code}, + $rec->{main_index}; + + $seenit{$hashkey} = 1; + $firstentry = 0; +} + +# Emit the inverse lookup array containing indexes into UnicodeDecompMain. +my $num_recomps = scalar @rec_cp_packed; +print $OF <<HEADER; +/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */ +static const uint16 RecompInverseLookup[$num_recomps] = +{ +$recomp_string +}; + +HEADER + +# Emit the definition of the recomposition hash function. +my $rec_funcname = 'Recomp_hash_func'; +my $rec_func = + PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname, + fixed_key_length => 8); +print $OF "/* Perfect hash function for recomposition */\n"; +print $OF "static $rec_func\n"; + +# Emit the structure that wraps the hash lookup information into +# one variable. +print $OF <<HEADER; +/* Hash lookup information for recomposition */ +static const pg_unicode_recompinfo UnicodeRecompInfo = +{ + RecompInverseLookup, + $rec_funcname, + $num_recomps +}; +HEADER + +close $OT; +close $OF; + +sub recomp_sort +{ + my $a1 = hex($a->{first}); + my $b1 = hex($b->{first}); + + my $a2 = hex($a->{second}); + my $b2 = hex($b->{second}); + + # First sort by the first code point + return -1 if $a1 < $b1; + return 1 if $a1 > $b1; + + # Then sort by the second code point + return -1 if $a2 < $b2; + return 1 if $a2 > $b2; + + # Finally sort by the code point that decomposes into first and + # second ones. + my $acode = hex($a->{code}); + my $bcode = hex($b->{code}); + + return -1 if $acode < $bcode; + return -1 if $acode > $bcode; + + die "found duplicate entries of recomposeable code pairs"; +} diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c index 4bb6a0f5873..4ffce0e6193 100644 --- a/src/common/unicode_norm.c +++ b/src/common/unicode_norm.c @@ -19,9 +19,11 @@ #endif #include "common/unicode_norm.h" -#include "common/unicode_norm_table.h" #ifndef FRONTEND +#include "common/unicode_norm_hashfunc.h" #include "common/unicode_normprops_table.h" +#else +#include "common/unicode_norm_table.h" #endif #include "port/pg_bswap.h" @@ -44,6 +46,46 @@ #define NCOUNT VCOUNT * TCOUNT #define SCOUNT LCOUNT * NCOUNT +/* + * get_code_entry + * + * Get the entry corresponding to code in the decomposition lookup table. + * The backend version of this code uses a perfect hash function for the + * lookup, while the frontend version uses a binary search. + */ +#ifndef FRONTEND + +static const pg_unicode_decomposition * +get_code_entry(pg_wchar code) +{ + int h; + uint32 hashkey; + pg_unicode_decompinfo decompinfo = UnicodeDecompInfo; + + /* + * Compute the hash function. The hash key is the codepoint with the bytes + * in network order. + */ + hashkey = pg_hton32(code); + h = decompinfo.hash(&hashkey); + + /* An out-of-range result implies no match */ + if (h < 0 || h >= decompinfo.num_decomps) + return NULL; + + /* + * Since it's a perfect hash, we need only match to the specific codepoint + * it identifies. + */ + if (code != decompinfo.decomps[h].codepoint) + return NULL; + + /* Success! */ + return &decompinfo.decomps[h]; +} + +#else + /* comparison routine for bsearch() of decomposition lookup table. */ static int conv_compare(const void *p1, const void *p2) @@ -56,10 +98,7 @@ conv_compare(const void *p1, const void *p2) return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); } -/* - * Get the entry corresponding to code in the decomposition lookup table. - */ -static pg_unicode_decomposition * +static const pg_unicode_decomposition * get_code_entry(pg_wchar code) { return bsearch(&(code), @@ -69,6 +108,8 @@ get_code_entry(pg_wchar code) conv_compare); } +#endif /* !FRONTEND */ + /* * Given a decomposition entry looked up earlier, get the decomposed * characters. @@ -77,7 +118,7 @@ get_code_entry(pg_wchar code) * is only valid until next call to this function! */ static const pg_wchar * -get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size) +get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size) { static pg_wchar x; @@ -104,7 +145,7 @@ get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size) static int get_decomposed_size(pg_wchar code, bool compat) { - pg_unicode_decomposition *entry; + const pg_unicode_decomposition *entry; int size = 0; int i; const uint32 *decomp; @@ -191,17 +232,51 @@ recompose_code(uint32 start, uint32 code, uint32 *result) } else { - int i; + const pg_unicode_decomposition *entry; /* * Do an inverse lookup of the decomposition tables to see if anything * matches. The comparison just needs to be a perfect match on the * sub-table of size two, because the start character has already been - * recomposed partially. + * recomposed partially. This lookup uses a perfect hash function for + * the backend code. */ +#ifndef FRONTEND + + int h, + inv_lookup_index; + uint64 hashkey; + pg_unicode_recompinfo recompinfo = UnicodeRecompInfo; + + /* + * Compute the hash function. The hash key is formed by concatenating + * bytes of the two codepoints in network order. See also + * src/common/unicode/generate-unicode_norm_table.pl. + */ + hashkey = pg_hton64(((uint64) start << 32) | (uint64) code); + h = recompinfo.hash(&hashkey); + + /* An out-of-range result implies no match */ + if (h < 0 || h >= recompinfo.num_recomps) + return false; + + inv_lookup_index = recompinfo.inverse_lookup[h]; + entry = &UnicodeDecompMain[inv_lookup_index]; + + if (start == UnicodeDecomp_codepoints[entry->dec_index] && + code == UnicodeDecomp_codepoints[entry->dec_index + 1]) + { + *result = entry->codepoint; + return true; + } + +#else + + int i; + for (i = 0; i < lengthof(UnicodeDecompMain); i++) { - const pg_unicode_decomposition *entry = &UnicodeDecompMain[i]; + entry = &UnicodeDecompMain[i]; if (DECOMPOSITION_SIZE(entry) != 2) continue; @@ -216,6 +291,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result) return true; } } +#endif /* !FRONTEND */ } return false; @@ -231,7 +307,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result) static void decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) { - pg_unicode_decomposition *entry; + const pg_unicode_decomposition *entry; int i; const uint32 *decomp; int dec_size; @@ -358,8 +434,8 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) pg_wchar prev = decomp_chars[count - 1]; pg_wchar next = decomp_chars[count]; pg_wchar tmp; - pg_unicode_decomposition *prevEntry = get_code_entry(prev); - pg_unicode_decomposition *nextEntry = get_code_entry(next); + const pg_unicode_decomposition *prevEntry = get_code_entry(prev); + const pg_unicode_decomposition *nextEntry = get_code_entry(next); /* * If no entries are found, the character used is either an Hangul @@ -417,7 +493,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) for (count = 1; count < decomp_size; count++) { pg_wchar ch = decomp_chars[count]; - pg_unicode_decomposition *ch_entry = get_code_entry(ch); + const pg_unicode_decomposition *ch_entry = get_code_entry(ch); int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class; pg_wchar composite; @@ -458,7 +534,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) static uint8 get_canonical_class(pg_wchar ch) { - pg_unicode_decomposition *entry = get_code_entry(ch); + const pg_unicode_decomposition *entry = get_code_entry(ch); if (!entry) return 0; |