aboutsummaryrefslogtreecommitdiff
path: root/src/common
diff options
context:
space:
mode:
Diffstat (limited to 'src/common')
-rw-r--r--src/common/unicode/Makefile4
-rw-r--r--src/common/unicode/generate-unicode_norm_table.pl226
-rw-r--r--src/common/unicode_norm.c106
3 files changed, 293 insertions, 43 deletions
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
index 93a9d1615f1..eb14add28ad 100644
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@@ -18,7 +18,7 @@ LIBS += $(PTHREAD_LIBS)
# By default, do nothing.
all:
-update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h
+update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
mv $^ ../../../src/include/common/
$(MAKE) normalization-check
@@ -30,6 +30,8 @@ UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt Normaliz
# Generation of conversion tables used for string normalization with
# UTF-8 strings.
+unicode_norm_hashfunc.h: unicode_norm_table.h
+
unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
$(PERL) generate-unicode_norm_table.pl
diff --git a/src/common/unicode/generate-unicode_norm_table.pl b/src/common/unicode/generate-unicode_norm_table.pl
index 7ce15e1a039..e4d3ccc2346 100644
--- a/src/common/unicode/generate-unicode_norm_table.pl
+++ b/src/common/unicode/generate-unicode_norm_table.pl
@@ -1,16 +1,22 @@
#!/usr/bin/perl
#
-# Generate a composition table, using Unicode data files as input
+# Generate a composition table and its lookup utilities, using Unicode data
+# files as input.
#
# Input: UnicodeData.txt and CompositionExclusions.txt
-# Output: unicode_norm_table.h
+# Output: unicode_norm_table.h and unicode_norm_hashfunc.h
#
# Copyright (c) 2000-2020, PostgreSQL Global Development Group
use strict;
use warnings;
-my $output_file = "unicode_norm_table.h";
+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+use PerfectHash;
+
+my $output_table_file = "unicode_norm_table.h";
+my $output_func_file = "unicode_norm_hashfunc.h";
my $FH;
@@ -64,11 +70,13 @@ close $FH;
my $num_characters = scalar @characters;
-# Start writing out the output file
-open my $OUTPUT, '>', $output_file
- or die "Could not open output file $output_file: $!\n";
+# Start writing out the output files
+open my $OT, '>', $output_table_file
+ or die "Could not open output file $output_table_file: $!\n";
+open my $OF, '>', $output_func_file
+ or die "Could not open output file $output_func_file: $!\n";
-print $OUTPUT <<HEADER;
+print $OT <<HEADER;
/*-------------------------------------------------------------------------
*
* unicode_norm_table.h
@@ -111,8 +119,53 @@ static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
{
HEADER
+print $OF <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * unicode_norm_hashfunc.h
+ * Perfect hash functions used for Unicode normalization
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/common/unicode_norm_hashfunc.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
+ * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H
+ * here.
+ */
+
+#include "common/unicode_norm_table.h"
+
+/* Typedef for perfect hash functions */
+typedef int (*cp_hash_func) (const void *key);
+
+/* Information for lookups with perfect hash functions */
+typedef struct
+{
+ const pg_unicode_decomposition *decomps;
+ cp_hash_func hash;
+ int num_decomps;
+} pg_unicode_decompinfo;
+
+typedef struct
+{
+ const uint16 *inverse_lookup;
+ cp_hash_func hash;
+ int num_recomps;
+} pg_unicode_recompinfo;
+
+HEADER
+
my $decomp_index = 0;
my $decomp_string = "";
+my @dec_cp_packed;
+my $main_index = 0;
+my @rec_info;
my $last_code = $characters[-1]->{code};
foreach my $char (@characters)
@@ -121,6 +174,9 @@ foreach my $char (@characters)
my $class = $char->{class};
my $decomp = $char->{decomp};
+ # Save the code point bytes as a string in network order.
+ push @dec_cp_packed, pack('N', hex($char->{code}));
+
# The character decomposition mapping field in UnicodeData.txt is a list
# of unicode codepoints, separated by space. But it can be prefixed with
# so-called compatibility formatting tag, like "<compat>", or "<font>".
@@ -163,7 +219,7 @@ foreach my $char (@characters)
{
foreach my $lcode (@composition_exclusion_codes)
{
- if ($lcode eq $char->{code})
+ if ($lcode eq $code)
{
$flags .= " | DECOMP_NO_COMPOSE";
$comment = "in exclusion list";
@@ -171,11 +227,26 @@ foreach my $char (@characters)
}
}
}
+
+ # Save info for recomposeable codepoints.
+ # Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C
+ # above! See also the inverse lookup in recompose_code() found in
+ # src/common/unicode_norm.c.
+ if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/))
+ {
+ push @rec_info,
+ {
+ code => $code,
+ main_index => $main_index,
+ first => $first_decomp,
+ second => $decomp_elts[0]
+ };
+ }
}
if ($decomp_size == 0)
{
- print $OUTPUT "\t{0x$code, $class, 0$flags, 0}";
+ print $OT "\t{0x$code, $class, 0$flags, 0}";
}
elsif ($decomp_size == 1 && length($first_decomp) <= 4)
{
@@ -183,12 +254,11 @@ foreach my $char (@characters)
# The decomposition consists of a single codepoint, and it fits
# in a uint16, so we can store it "inline" in the main table.
$flags .= " | DECOMP_INLINE";
- print $OUTPUT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
+ print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
}
else
{
- print $OUTPUT
- "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";
+ print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";
# Now save the decompositions into a dedicated area that will
# be written afterwards. First build the entry dedicated to
@@ -205,25 +275,17 @@ foreach my $char (@characters)
}
# Print a comma after all items except the last one.
- print $OUTPUT "," unless ($code eq $last_code);
- if ($comment ne "")
- {
+ print $OT "," unless ($code eq $last_code);
- # If the line is wide already, indent the comment with one tab,
- # otherwise with two. This is to make the output match the way
- # pgindent would mangle it. (This is quite hacky. To do this
- # properly, we should actually track how long the line is so far,
- # but this works for now.)
- print $OUTPUT "\t" if ($decomp_index < 10);
+ print $OT "\t/* $comment */" if ($comment ne "");
+ print $OT "\n";
- print $OUTPUT "\t/* $comment */" if ($comment ne "");
- }
- print $OUTPUT "\n";
+ $main_index++;
}
-print $OUTPUT "\n};\n\n";
+print $OT "\n};\n\n";
# Print the array of decomposed codes.
-print $OUTPUT <<HEADER;
+print $OT <<HEADER;
/* codepoints array */
static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
{
@@ -231,4 +293,114 @@ $decomp_string
};
HEADER
-close $OUTPUT;
+# Emit the definition of the decomp hash function.
+my $dec_funcname = 'Decomp_hash_func';
+my $dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed,
+ $dec_funcname, fixed_key_length => 4);
+print $OF "/* Perfect hash function for decomposition */\n";
+print $OF "static $dec_func\n";
+
+# Emit the structure that wraps the hash lookup information into
+# one variable.
+print $OF <<HEADER;
+/* Hash lookup information for decomposition */
+static const pg_unicode_decompinfo UnicodeDecompInfo =
+{
+ UnicodeDecompMain,
+ $dec_funcname,
+ $num_characters
+};
+
+HEADER
+
+# Find the lowest codepoint that decomposes to each recomposeable
+# code pair and create a mapping to it.
+my $recomp_string = "";
+my @rec_cp_packed;
+my %seenit;
+my $firstentry = 1;
+foreach my $rec (sort recomp_sort @rec_info)
+{
+ # The hash key is formed by concatenating the bytes of the two
+ # codepoints. See also recompose_code() in common/unicode_norm.c.
+ my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second});
+
+ # We are only interested in the lowest code point that decomposes
+ # to the given code pair.
+ next if $seenit{$hashkey};
+
+ # Save the hash key bytes in network order
+ push @rec_cp_packed, pack('Q>', $hashkey);
+
+ # Append inverse lookup element
+ $recomp_string .= ",\n" if !$firstentry;
+ $recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s",
+ $rec->{first},
+ $rec->{second},
+ $rec->{code},
+ $rec->{main_index};
+
+ $seenit{$hashkey} = 1;
+ $firstentry = 0;
+}
+
+# Emit the inverse lookup array containing indexes into UnicodeDecompMain.
+my $num_recomps = scalar @rec_cp_packed;
+print $OF <<HEADER;
+/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */
+static const uint16 RecompInverseLookup[$num_recomps] =
+{
+$recomp_string
+};
+
+HEADER
+
+# Emit the definition of the recomposition hash function.
+my $rec_funcname = 'Recomp_hash_func';
+my $rec_func =
+ PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname,
+ fixed_key_length => 8);
+print $OF "/* Perfect hash function for recomposition */\n";
+print $OF "static $rec_func\n";
+
+# Emit the structure that wraps the hash lookup information into
+# one variable.
+print $OF <<HEADER;
+/* Hash lookup information for recomposition */
+static const pg_unicode_recompinfo UnicodeRecompInfo =
+{
+ RecompInverseLookup,
+ $rec_funcname,
+ $num_recomps
+};
+HEADER
+
+close $OT;
+close $OF;
+
+sub recomp_sort
+{
+ my $a1 = hex($a->{first});
+ my $b1 = hex($b->{first});
+
+ my $a2 = hex($a->{second});
+ my $b2 = hex($b->{second});
+
+ # First sort by the first code point
+ return -1 if $a1 < $b1;
+ return 1 if $a1 > $b1;
+
+ # Then sort by the second code point
+ return -1 if $a2 < $b2;
+ return 1 if $a2 > $b2;
+
+ # Finally sort by the code point that decomposes into first and
+ # second ones.
+ my $acode = hex($a->{code});
+ my $bcode = hex($b->{code});
+
+ return -1 if $acode < $bcode;
+    return 1 if $acode > $bcode;
+
+ die "found duplicate entries of recomposeable code pairs";
+}
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index 4bb6a0f5873..4ffce0e6193 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -19,9 +19,11 @@
#endif
#include "common/unicode_norm.h"
-#include "common/unicode_norm_table.h"
#ifndef FRONTEND
+#include "common/unicode_norm_hashfunc.h"
#include "common/unicode_normprops_table.h"
+#else
+#include "common/unicode_norm_table.h"
#endif
#include "port/pg_bswap.h"
@@ -44,6 +46,46 @@
#define NCOUNT VCOUNT * TCOUNT
#define SCOUNT LCOUNT * NCOUNT
+/*
+ * get_code_entry
+ *
+ * Get the entry corresponding to code in the decomposition lookup table.
+ * The backend version of this code uses a perfect hash function for the
+ * lookup, while the frontend version uses a binary search.
+ */
+#ifndef FRONTEND
+
+static const pg_unicode_decomposition *
+get_code_entry(pg_wchar code)
+{
+ int h;
+ uint32 hashkey;
+ pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
+
+ /*
+ * Compute the hash function. The hash key is the codepoint with the bytes
+ * in network order.
+ */
+ hashkey = pg_hton32(code);
+ h = decompinfo.hash(&hashkey);
+
+ /* An out-of-range result implies no match */
+ if (h < 0 || h >= decompinfo.num_decomps)
+ return NULL;
+
+ /*
+ * Since it's a perfect hash, we need only match to the specific codepoint
+ * it identifies.
+ */
+ if (code != decompinfo.decomps[h].codepoint)
+ return NULL;
+
+ /* Success! */
+ return &decompinfo.decomps[h];
+}
+
+#else
+
/* comparison routine for bsearch() of decomposition lookup table. */
static int
conv_compare(const void *p1, const void *p2)
@@ -56,10 +98,7 @@ conv_compare(const void *p1, const void *p2)
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
}
-/*
- * Get the entry corresponding to code in the decomposition lookup table.
- */
-static pg_unicode_decomposition *
+static const pg_unicode_decomposition *
get_code_entry(pg_wchar code)
{
return bsearch(&(code),
@@ -69,6 +108,8 @@ get_code_entry(pg_wchar code)
conv_compare);
}
+#endif /* !FRONTEND */
+
/*
* Given a decomposition entry looked up earlier, get the decomposed
* characters.
@@ -77,7 +118,7 @@ get_code_entry(pg_wchar code)
* is only valid until next call to this function!
*/
static const pg_wchar *
-get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
+get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
{
static pg_wchar x;
@@ -104,7 +145,7 @@ get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
static int
get_decomposed_size(pg_wchar code, bool compat)
{
- pg_unicode_decomposition *entry;
+ const pg_unicode_decomposition *entry;
int size = 0;
int i;
const uint32 *decomp;
@@ -191,17 +232,51 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
}
else
{
- int i;
+ const pg_unicode_decomposition *entry;
/*
* Do an inverse lookup of the decomposition tables to see if anything
* matches. The comparison just needs to be a perfect match on the
* sub-table of size two, because the start character has already been
- * recomposed partially.
+ * recomposed partially. This lookup uses a perfect hash function for
+ * the backend code.
*/
+#ifndef FRONTEND
+
+ int h,
+ inv_lookup_index;
+ uint64 hashkey;
+ pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
+
+ /*
+ * Compute the hash function. The hash key is formed by concatenating
+ * bytes of the two codepoints in network order. See also
+ * src/common/unicode/generate-unicode_norm_table.pl.
+ */
+ hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
+ h = recompinfo.hash(&hashkey);
+
+ /* An out-of-range result implies no match */
+ if (h < 0 || h >= recompinfo.num_recomps)
+ return false;
+
+ inv_lookup_index = recompinfo.inverse_lookup[h];
+ entry = &UnicodeDecompMain[inv_lookup_index];
+
+ if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
+ code == UnicodeDecomp_codepoints[entry->dec_index + 1])
+ {
+ *result = entry->codepoint;
+ return true;
+ }
+
+#else
+
+ int i;
+
for (i = 0; i < lengthof(UnicodeDecompMain); i++)
{
- const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];
+ entry = &UnicodeDecompMain[i];
if (DECOMPOSITION_SIZE(entry) != 2)
continue;
@@ -216,6 +291,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
return true;
}
}
+#endif /* !FRONTEND */
}
return false;
@@ -231,7 +307,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
static void
decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
{
- pg_unicode_decomposition *entry;
+ const pg_unicode_decomposition *entry;
int i;
const uint32 *decomp;
int dec_size;
@@ -358,8 +434,8 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
pg_wchar prev = decomp_chars[count - 1];
pg_wchar next = decomp_chars[count];
pg_wchar tmp;
- pg_unicode_decomposition *prevEntry = get_code_entry(prev);
- pg_unicode_decomposition *nextEntry = get_code_entry(next);
+ const pg_unicode_decomposition *prevEntry = get_code_entry(prev);
+ const pg_unicode_decomposition *nextEntry = get_code_entry(next);
/*
* If no entries are found, the character used is either an Hangul
@@ -417,7 +493,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
for (count = 1; count < decomp_size; count++)
{
pg_wchar ch = decomp_chars[count];
- pg_unicode_decomposition *ch_entry = get_code_entry(ch);
+ const pg_unicode_decomposition *ch_entry = get_code_entry(ch);
int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
pg_wchar composite;
@@ -458,7 +534,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
static uint8
get_canonical_class(pg_wchar ch)
{
- pg_unicode_decomposition *entry = get_code_entry(ch);
+ const pg_unicode_decomposition *entry = get_code_entry(ch);
if (!entry)
return 0;