aboutsummaryrefslogtreecommitdiff
path: root/src/common/unicode_norm.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/common/unicode_norm.c')
-rw-r--r--src/common/unicode_norm.c110
1 files changed, 110 insertions, 0 deletions
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index ec5abea6bdd..4f4c029075b 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -20,6 +20,9 @@
#include "common/unicode_norm.h"
#include "common/unicode_norm_table.h"
+#ifndef FRONTEND
+#include "common/unicode_normprops_table.h"
+#endif
#ifndef FRONTEND
#define ALLOC(size) palloc(size)
@@ -442,3 +445,110 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
return recomp_chars;
}
+
+/*
+ * Normalization "quick check" algorithm; see
+ * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
+ */
+
+/* We only need this in the backend. */
+#ifndef FRONTEND
+
+/*
+ * Return the canonical combining class of a code point.
+ *
+ * The class is taken from the entry that get_code_entry() yields for 'ch'.
+ * When there is no entry for the code point, the class is reported as 0.
+ */
+static uint8
+get_canonical_class(pg_wchar ch)
+{
+	pg_unicode_decomposition *decomp = get_code_entry(ch);
+
+	return decomp ? decomp->comb_class : 0;
+}
+
+/*
+ * bsearch() comparator for pg_unicode_normprops entries, ordering them by
+ * code point.
+ *
+ * Returns -1, 0 or 1 when the code point of 'p1' is respectively less than,
+ * equal to, or greater than the code point of 'p2'.
+ *
+ * The result is produced by explicit comparisons rather than by returning
+ * the difference of the two unsigned values: that difference wraps around
+ * whenever the first code point is the smaller one, and converting the
+ * wrapped value to int is implementation-defined.
+ */
+static int
+qc_compare(const void *p1, const void *p2)
+{
+	uint32		v1,
+				v2;
+
+	v1 = ((const pg_unicode_normprops *) p1)->codepoint;
+	v2 = ((const pg_unicode_normprops *) p2)->codepoint;
+
+	if (v1 < v2)
+		return -1;
+	if (v1 > v2)
+		return 1;
+	return 0;
+}
+
+/*
+ * Fetch the normalization quick check property of one code point.
+ *
+ * 'form' selects which lookup table is searched; only UNICODE_NFC and
+ * UNICODE_NFKC have a table here, and any other value trips an assertion.
+ * 'ch' is the code point to look up.
+ *
+ * Returns the quickcheck value stored in the matching table entry.  A code
+ * point that has no entry is reported as UNICODE_NORM_QC_YES.
+ */
+static UnicodeNormalizationQC
+qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
+{
+	const pg_unicode_normprops *table = NULL;
+	const pg_unicode_normprops *hit;
+	size_t		nentries = 0;
+	pg_unicode_normprops probe;
+
+	/* Only the code point of the probe is consulted by qc_compare(). */
+	probe.codepoint = ch;
+
+	/* Pick the table that belongs to the requested form. */
+	switch (form)
+	{
+		case UNICODE_NFC:
+			table = UnicodeNormProps_NFC_QC;
+			nentries = lengthof(UnicodeNormProps_NFC_QC);
+			break;
+		case UNICODE_NFKC:
+			table = UnicodeNormProps_NFKC_QC;
+			nentries = lengthof(UnicodeNormProps_NFKC_QC);
+			break;
+		default:
+			Assert(false);
+			break;
+	}
+
+	/* Without a table there is nothing to find. */
+	if (table == NULL)
+		return UNICODE_NORM_QC_YES;
+
+	hit = bsearch(&probe, table, nentries,
+				  sizeof(pg_unicode_normprops), qc_compare);
+
+	return hit ? hit->quickcheck : UNICODE_NORM_QC_YES;
+}
+
+/*
+ * Run the normalization quick check over a zero-terminated array of code
+ * points.
+ *
+ * Returns UNICODE_NORM_QC_NO as soon as a code point has a nonzero canonical
+ * class lower than that of its predecessor, or as soon as the lookup for a
+ * code point says UNICODE_NORM_QC_NO.  Otherwise returns
+ * UNICODE_NORM_QC_MAYBE if any code point was reported as MAYBE, and
+ * UNICODE_NORM_QC_YES if none was.  For UNICODE_NFD and UNICODE_NFKD the
+ * input is not examined and the answer is always UNICODE_NORM_QC_MAYBE.
+ */
+UnicodeNormalizationQC
+unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
+{
+	UnicodeNormalizationQC verdict = UNICODE_NORM_QC_YES;
+	uint8		prevClass = 0;
+	const pg_wchar *cur;
+
+	/*
+	 * The quickcheck is not run for the "D" forms.  Their lookup tables are
+	 * not included because they are huge and checking for these forms is
+	 * less common.  The slow path is also faster for the "D" forms than for
+	 * the "C" forms, since no (slow) recomposition is needed.
+	 */
+	if (form == UNICODE_NFD || form == UNICODE_NFKD)
+		return UNICODE_NORM_QC_MAYBE;
+
+	for (cur = input; *cur != 0; cur++)
+	{
+		uint8		curClass = get_canonical_class(*cur);
+		UnicodeNormalizationQC prop;
+
+		/* A nonzero class must not be lower than the preceding one. */
+		if (curClass != 0 && curClass < prevClass)
+			return UNICODE_NORM_QC_NO;
+
+		prop = qc_is_allowed(form, *cur);
+		switch (prop)
+		{
+			case UNICODE_NORM_QC_NO:
+				return UNICODE_NORM_QC_NO;
+			case UNICODE_NORM_QC_MAYBE:
+				/* Remember the doubt, but keep looking for a definite NO. */
+				verdict = UNICODE_NORM_QC_MAYBE;
+				break;
+			default:
+				break;
+		}
+
+		prevClass = curClass;
+	}
+
+	return verdict;
+}
+
+#endif /* !FRONTEND */