diff options
Diffstat (limited to 'src/common/unicode_norm.c')
-rw-r--r-- | src/common/unicode_norm.c | 106 |
1 files changed, 91 insertions, 15 deletions
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c index 4bb6a0f5873..4ffce0e6193 100644 --- a/src/common/unicode_norm.c +++ b/src/common/unicode_norm.c @@ -19,9 +19,11 @@ #endif #include "common/unicode_norm.h" -#include "common/unicode_norm_table.h" #ifndef FRONTEND +#include "common/unicode_norm_hashfunc.h" #include "common/unicode_normprops_table.h" +#else +#include "common/unicode_norm_table.h" #endif #include "port/pg_bswap.h" @@ -44,6 +46,46 @@ #define NCOUNT VCOUNT * TCOUNT #define SCOUNT LCOUNT * NCOUNT +/* + * get_code_entry + * + * Get the entry corresponding to code in the decomposition lookup table. + * The backend version of this code uses a perfect hash function for the + * lookup, while the frontend version uses a binary search. + */ +#ifndef FRONTEND + +static const pg_unicode_decomposition * +get_code_entry(pg_wchar code) +{ + int h; + uint32 hashkey; + pg_unicode_decompinfo decompinfo = UnicodeDecompInfo; + + /* + * Compute the hash function. The hash key is the codepoint with the bytes + * in network order. + */ + hashkey = pg_hton32(code); + h = decompinfo.hash(&hashkey); + + /* An out-of-range result implies no match */ + if (h < 0 || h >= decompinfo.num_decomps) + return NULL; + + /* + * Since it's a perfect hash, we need only match to the specific codepoint + * it identifies. + */ + if (code != decompinfo.decomps[h].codepoint) + return NULL; + + /* Success! */ + return &decompinfo.decomps[h]; +} + +#else + /* comparison routine for bsearch() of decomposition lookup table. */ static int conv_compare(const void *p1, const void *p2) @@ -56,10 +98,7 @@ conv_compare(const void *p1, const void *p2) return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); } -/* - * Get the entry corresponding to code in the decomposition lookup table. - */ -static pg_unicode_decomposition * +static const pg_unicode_decomposition * get_code_entry(pg_wchar code) { return bsearch(&(code), @@ -69,6 +108,8 @@ get_code_entry(pg_wchar code) conv_compare); } +#endif /* !FRONTEND */ + /* * Given a decomposition entry looked up earlier, get the decomposed * characters. @@ -77,7 +118,7 @@ get_code_entry(pg_wchar code) * is only valid until next call to this function! */ static const pg_wchar * -get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size) +get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size) { static pg_wchar x; @@ -104,7 +145,7 @@ get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size) static int get_decomposed_size(pg_wchar code, bool compat) { - pg_unicode_decomposition *entry; + const pg_unicode_decomposition *entry; int size = 0; int i; const uint32 *decomp; @@ -191,17 +232,51 @@ recompose_code(uint32 start, uint32 code, uint32 *result) } else { - int i; + const pg_unicode_decomposition *entry; /* * Do an inverse lookup of the decomposition tables to see if anything * matches. The comparison just needs to be a perfect match on the * sub-table of size two, because the start character has already been - * recomposed partially. + * recomposed partially. This lookup uses a perfect hash function for + * the backend code. */ +#ifndef FRONTEND + + int h, + inv_lookup_index; + uint64 hashkey; + pg_unicode_recompinfo recompinfo = UnicodeRecompInfo; + + /* + * Compute the hash function. The hash key is formed by concatenating + * bytes of the two codepoints in network order. See also + * src/common/unicode/generate-unicode_norm_table.pl. + */ + hashkey = pg_hton64(((uint64) start << 32) | (uint64) code); + h = recompinfo.hash(&hashkey); + + /* An out-of-range result implies no match */ + if (h < 0 || h >= recompinfo.num_recomps) + return false; + + inv_lookup_index = recompinfo.inverse_lookup[h]; + entry = &UnicodeDecompMain[inv_lookup_index]; + + if (start == UnicodeDecomp_codepoints[entry->dec_index] && + code == UnicodeDecomp_codepoints[entry->dec_index + 1]) + { + *result = entry->codepoint; + return true; + } + +#else + + int i; + for (i = 0; i < lengthof(UnicodeDecompMain); i++) { - const pg_unicode_decomposition *entry = &UnicodeDecompMain[i]; + entry = &UnicodeDecompMain[i]; if (DECOMPOSITION_SIZE(entry) != 2) continue; @@ -216,6 +291,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result) return true; } } +#endif /* !FRONTEND */ } return false; @@ -231,7 +307,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result) static void decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) { - pg_unicode_decomposition *entry; + const pg_unicode_decomposition *entry; int i; const uint32 *decomp; int dec_size; @@ -358,8 +434,8 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) pg_wchar prev = decomp_chars[count - 1]; pg_wchar next = decomp_chars[count]; pg_wchar tmp; - pg_unicode_decomposition *prevEntry = get_code_entry(prev); - pg_unicode_decomposition *nextEntry = get_code_entry(next); + const pg_unicode_decomposition *prevEntry = get_code_entry(prev); + const pg_unicode_decomposition *nextEntry = get_code_entry(next); /* * If no entries are found, the character used is either an Hangul @@ -417,7 +493,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) for (count = 1; count < decomp_size; count++) { pg_wchar ch = decomp_chars[count]; - pg_unicode_decomposition *ch_entry = get_code_entry(ch); + const pg_unicode_decomposition *ch_entry = get_code_entry(ch); int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class; pg_wchar composite; @@ -458,7 +534,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) static uint8 get_canonical_class(pg_wchar ch) { - pg_unicode_decomposition *entry = get_code_entry(ch); + const pg_unicode_decomposition *entry = get_code_entry(ch); if (!entry) return 0; |