git.kaiwu.me - njs.git/commitdiff
Introduced UTF-16 according to WHATWG encoding spec.
author Alexander Borisov <alexander.borisov@nginx.com>
Wed, 15 Jul 2020 16:19:18 +0000 (19:19 +0300)
committer Alexander Borisov <alexander.borisov@nginx.com>
Wed, 15 Jul 2020 16:19:18 +0000 (19:19 +0300)
auto/make
auto/sources
src/njs_main.h
src/njs_unicode.h [new file with mode: 0644]
src/njs_utf16.c [new file with mode: 0644]
src/njs_utf16.h [new file with mode: 0644]
src/test/unicode_unit_test.c [moved from src/test/utf8_unit_test.c with 59% similarity]

index caac3a2e2be883b0395371bbcea1154c51289004..4ec533feefe5de422b51f10df049d35b7d2f8963 100644 (file)
--- a/auto/make
+++ b/auto/make
@@ -241,12 +241,12 @@ lib_test: $NJS_BUILD_DIR/njs_auto_config.h \\
        $NJS_BUILD_DIR/random_unit_test \\
        $NJS_BUILD_DIR/rbtree_unit_test \\
        $NJS_BUILD_DIR/lvlhsh_unit_test \\
-       $NJS_BUILD_DIR/utf8_unit_test
+       $NJS_BUILD_DIR/unicode_unit_test
 
        $NJS_BUILD_DIR/random_unit_test
        $NJS_BUILD_DIR/rbtree_unit_test
        $NJS_BUILD_DIR/lvlhsh_unit_test
-       $NJS_BUILD_DIR/utf8_unit_test
+       $NJS_BUILD_DIR/unicode_unit_test
 
 unit_test: $NJS_BUILD_DIR/njs_auto_config.h \\
        $NJS_BUILD_DIR/njs_unit_test
index d5be8ef532e4aba135d3c514b6c749cc5ed9f146..d9109764a1df98343897fed7fa556ba40b2d6083 100644 (file)
--- a/auto/sources
+++ b/auto/sources
@@ -6,6 +6,7 @@ NJS_LIB_SRCS=" \
    src/njs_murmur_hash.c \
    src/njs_djb_hash.c \
    src/njs_utf8.c \
+   src/njs_utf16.c \
    src/njs_arr.c \
    src/njs_rbtree.c \
    src/njs_lvlhsh.c \
@@ -60,7 +61,7 @@ NJS_LIB_TEST_SRCS=" \
    src/test/lvlhsh_unit_test.c \
    src/test/random_unit_test.c \
    src/test/rbtree_unit_test.c \
-   src/test/utf8_unit_test.c \
+   src/test/unicode_unit_test.c \
 "
 
 NJS_TEST_SRCS=" \
index f03d7f3f1c4152230ce21d3409e61b6c30c450a9..83aeb8afc194d7f37db0fbd7f3df0e370f9c98ca 100644 (file)
--- a/src/njs_main.h
+++ b/src/njs_main.h
@@ -14,7 +14,9 @@
 #include <njs_types.h>
 #include <njs_clang.h>
 #include <njs_str.h>
+#include <njs_unicode.h>
 #include <njs_utf8.h>
+#include <njs_utf16.h>
 #include <njs_diyfp.h>
 #include <njs_dtoa.h>
 #include <njs_dtoa_fixed.h>
diff --git a/src/njs_unicode.h b/src/njs_unicode.h
new file mode 100644 (file)
index 0000000..a2d3214
--- /dev/null
@@ -0,0 +1,23 @@
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+#ifndef _NJS_UNICODE_H_INCLUDED_
+#define _NJS_UNICODE_H_INCLUDED_
+
+
+enum {
+    /* Largest value accepted as a decoded code point. */
+    NJS_UNICODE_MAX_CODEPOINT = 0x10FFFF,
+
+    /*
+     * Decoder status values.  Both lie above NJS_UNICODE_MAX_CODEPOINT,
+     * so a single "result > NJS_UNICODE_MAX_CODEPOINT" comparison tells
+     * a status apart from a decoded code point.
+     */
+
+    /* The input is not valid. */
+    NJS_UNICODE_ERROR = 0x1FFFFF,
+
+    /* The input ended in the middle of a sequence; more bytes are needed. */
+    NJS_UNICODE_CONTINUE = 0x2FFFFF
+};
+
+/*
+ * Decoder state kept between calls so that the input can be supplied
+ * in arbitrary chunks.
+ *
+ *   codepoint - a lead (high) surrogate waiting for its trail surrogate,
+ *               or 0 when nothing is pending.
+ *
+ *   upper     - the first byte of a code unit whose second byte has not
+ *               been read yet, stored as (byte + 1) so that 0 means
+ *               "nothing saved".  The stored value therefore ranges over
+ *               0x01..0x100 and does not fit into 8 bits: with an u_char
+ *               field a saved 0xFF byte wraps around to 0 and is lost.
+ */
+typedef struct {
+    uint32_t  codepoint;
+    uint16_t  upper;
+} njs_unicode_decode_t;
+
+
+#endif /* _NJS_UNICODE_H_INCLUDED_ */
diff --git a/src/njs_utf16.c b/src/njs_utf16.c
new file mode 100644 (file)
index 0000000..6626286
--- /dev/null
@@ -0,0 +1,116 @@
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+
+#include <njs_main.h>
+
+
+/*
+ * Stores the low 16 bits of "cp" as one UTF-16 code unit (two bytes) at
+ * *start and advances *start past them.  The byte order is chosen at
+ * compile time by NJS_HAVE_BIG_ENDIAN.  No bounds checking is done here;
+ * the caller must guarantee room for two bytes.
+ */
+njs_inline void
+njs_utf16_encode_write(uint32_t cp, u_char **start)
+{
+    u_char  *p;
+
+    p = *start;
+
+#ifdef NJS_HAVE_BIG_ENDIAN
+    p[0] = cp >> 8;
+    p[1] = cp & 0x00FF;
+#else
+    p[0] = cp & 0x00FF;
+    p[1] = cp >> 8;
+#endif
+
+    *start = p + 2;
+}
+
+
+/*
+ * Encodes the code point "cp" as UTF-16 into the buffer [*start, end).
+ *
+ * Code points below 0x10000 are written as a single code unit; this
+ * includes values in the surrogate range 0xD800..0xDFFF, which are
+ * written as is.  Larger code points are written as a surrogate pair.
+ *
+ * On success *start is advanced past the written bytes and the number
+ * of bytes written (2 or 4) is returned.  NJS_ERROR is returned, and
+ * nothing is written, if "cp" is greater than NJS_UNICODE_MAX_CODEPOINT
+ * or if the buffer is too small.
+ */
+ssize_t
+njs_utf16_encode(uint32_t cp, u_char **start, const u_char *end)
+{
+    ssize_t  size;
+
+    /* Larger values cannot be represented by a surrogate pair. */
+
+    if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+        return NJS_ERROR;
+    }
+
+    size = (cp < 0x10000) ? 2 : 4;
+
+    /*
+     * Compare the remaining size rather than pointers: "*start + size"
+     * may point past the end of the buffer, which is undefined.
+     */
+
+    if ((end - *start) < size) {
+        return NJS_ERROR;
+    }
+
+    if (size == 2) {
+        njs_utf16_encode_write(cp, start);
+
+        return size;
+    }
+
+    cp -= 0x10000;
+
+    /* Lead surrogate: upper 10 bits; trail surrogate: lower 10 bits. */
+
+    njs_utf16_encode_write((0xD800 | (cp >> 0x0A)), start);
+    njs_utf16_encode_write((0xDC00 | (cp & 0x03FF)), start);
+
+    return size;
+}
+
+
+/*
+ * Decodes one code point from the UTF-16 byte stream [*start, end) and
+ * advances *start past the consumed bytes.  The byte order is chosen at
+ * compile time by NJS_HAVE_BIG_ENDIAN.  "ctx" carries the state between
+ * calls, so the stream may be supplied in chunks of any size.
+ *
+ * Returns:
+ *   - the decoded code point;
+ *   - NJS_UNICODE_CONTINUE if the input ended inside a code unit or
+ *     after a lead surrogate; the partial data is remembered in "ctx"
+ *     and decoding resumes on the next call;
+ *   - NJS_UNICODE_ERROR for a trail surrogate without a preceding lead
+ *     surrogate, or for a lead surrogate that is not followed by a trail
+ *     surrogate.  In the latter case the offending code unit is not
+ *     consumed: it is decoded again by the next call.
+ */
+uint32_t
+njs_utf16_decode(njs_unicode_decode_t *ctx, const u_char **start,
+    const u_char *end)
+{
+    uint32_t  unit;
+    unsigned  lead;
+
+    /*
+     * Nothing to read.  Both entry paths below dereference *start, so
+     * an empty range must be rejected first; the state in "ctx" is kept
+     * untouched for the next call.
+     */
+
+    if (*start >= end) {
+        return NJS_UNICODE_CONTINUE;
+    }
+
+    /* Resume a code unit whose first byte was saved by a previous call. */
+
+    if (ctx->upper != 0x00) {
+        lead = ctx->upper - 0x01;
+        ctx->upper = 0x00;
+
+        goto lead_state;
+    }
+
+pair_state:
+
+    lead = *(*start)++;
+
+    if (*start >= end) {
+        /*
+         * Only one byte of the code unit is available.  It is saved as
+         * (byte + 1) so that zero means "nothing saved".
+         *
+         * NOTE(review): the saved value is 0x100 for a 0xFF byte;
+         * confirm that ctx->upper is wide enough to hold it.
+         */
+        ctx->upper = lead + 0x01;
+        return NJS_UNICODE_CONTINUE;
+    }
+
+lead_state:
+
+#ifdef NJS_HAVE_BIG_ENDIAN
+    unit = (lead << 8) + *(*start)++;
+#else
+    unit = (*(*start)++ << 8) + lead;
+#endif
+
+    /* A lead surrogate is pending: this unit must be a trail surrogate. */
+
+    if (ctx->codepoint != 0x00) {
+        if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
+            unit = 0x10000 + ((ctx->codepoint - 0xD800) << 10)
+                   + (unit - 0xDC00);
+
+            ctx->codepoint = 0x00;
+
+            return unit;
+        }
+
+        /*
+         * Not a trail surrogate.  Give the unit back: step one byte back
+         * and save its first byte, so that the next call resumes at
+         * lead_state and decodes the same unit again.
+         */
+
+        (*start)--;
+
+        ctx->upper = lead + 0x01;
+        ctx->codepoint = 0x00;
+
+        return NJS_UNICODE_ERROR;
+    }
+
+    /* Surrogate pair. */
+
+    if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
+
+        /* A trail surrogate cannot start a pair. */
+
+        if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
+            return NJS_UNICODE_ERROR;
+        }
+
+        ctx->codepoint = unit;
+
+        if (*start >= end) {
+            return NJS_UNICODE_CONTINUE;
+        }
+
+        goto pair_state;
+    }
+
+    return unit;
+}
diff --git a/src/njs_utf16.h b/src/njs_utf16.h
new file mode 100644 (file)
index 0000000..b567558
--- /dev/null
@@ -0,0 +1,25 @@
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+#ifndef _NJS_UTF16_H_INCLUDED_
+#define _NJS_UTF16_H_INCLUDED_
+
+
+NJS_EXPORT ssize_t njs_utf16_encode(uint32_t cp, u_char **start,
+    const u_char *end);
+NJS_EXPORT uint32_t njs_utf16_decode(njs_unicode_decode_t *ctx,
+    const u_char **start, const u_char *end);
+
+
+/*
+ * Puts "ctx" into the initial decoder state: no pending lead surrogate
+ * and no saved first byte of a code unit.
+ */
+njs_inline void
+njs_utf16_decode_init(njs_unicode_decode_t *ctx)
+{
+    ctx->codepoint = 0;
+    ctx->upper = 0;
+}
+
+
+#endif /* _NJS_UTF16_H_INCLUDED_ */
similarity index 59%
rename from src/test/utf8_unit_test.c
rename to src/test/unicode_unit_test.c
index ff627637e38c462cca0b18c161385281f2d2612d..1331f69bbdddd43fa0b1d4db33d653fe6c8b4289 100644 (file)
@@ -9,7 +9,6 @@
 
 
 #define NJS_UTF8_START_TEST  0xC2
-//#define NJS_UTF8_START_TEST  0
 
 
 static u_char  invalid[] = {
@@ -87,7 +86,7 @@ utf8_unit_test(njs_uint_t start)
     njs_uint_t    i, k, l, m;
     const u_char  *pp;
 
-    njs_printf("utf8 unit test started\n");
+    njs_printf("utf8 test started\n");
 
     /* Test valid UTF-8. */
 
@@ -181,7 +180,103 @@ utf8_unit_test(njs_uint_t start)
         return NJS_ERROR;
     }
 
-    njs_printf("utf8 unit test passed\n");
+    njs_printf("utf8 test passed\n");
+    return NJS_OK;
+}
+
+
+/*
+ * UTF-16 encoder/decoder test.
+ *
+ * 1. Every code point outside the surrogate range is encoded, decoded
+ *    back and encoded again; the decoded value and both encodings must
+ *    match.
+ * 2. Every value in the surrogate range is written as a single code
+ *    unit followed by a second unit.  A lead surrogate followed by a
+ *    trail surrogate must decode to the corresponding supplementary
+ *    code point; a sequence starting with a trail surrogate must be
+ *    rejected with NJS_UNICODE_ERROR.
+ *
+ * Returns NJS_OK on success and NJS_ERROR on the first failure.
+ */
+static njs_int_t
+utf16_unit_test(void)
+{
+    u_char                *start;
+    ssize_t               length, length_to;
+    uint32_t              cp, i, expected;
+    const u_char          *pos, *end, *end_to;
+    njs_unicode_decode_t  ctx;
+    u_char                buf[8], to[4];
+
+    njs_printf("utf16 test started\n");
+
+    end = buf + sizeof(buf);
+    end_to = to + sizeof(to);
+
+    for (i = 0; i <= NJS_UNICODE_MAX_CODEPOINT; i++) {
+
+        /* Skip surrogate pair. */
+
+        if (i >= 0xD800 && i <= 0xDFFF) {
+            continue;
+        }
+
+        start = buf;
+
+        length = njs_utf16_encode(i, &start, end);
+        if (length < NJS_OK) {
+            njs_printf("utf16 test encode failed\n");
+            return NJS_ERROR;
+        }
+
+        njs_utf16_decode_init(&ctx);
+
+        pos = buf;
+
+        cp = njs_utf16_decode(&ctx, &pos, buf + length);
+        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+            njs_printf("utf16 test decode failed\n");
+            return NJS_ERROR;
+        }
+
+        if (cp != i) {
+            njs_printf("utf16 test decode code point does not match\n");
+            return NJS_ERROR;
+        }
+
+        start = to;
+
+        length_to = njs_utf16_encode(cp, &start, end_to);
+        if (length_to < NJS_OK) {
+            njs_printf("utf16 test encode failed\n");
+            return NJS_ERROR;
+        }
+
+        /*
+         * UTF-16 data contains zero bytes (e.g. every code point below
+         * 0x100), so the buffers are compared with memcmp(): a
+         * comparison with strncmp() semantics stops at the first zero
+         * byte and would not look at the rest of the encoding.
+         */
+
+        if (length_to != length || memcmp(buf, to, length) != 0) {
+            njs_printf("utf16 test decode-encode failed\n");
+            return NJS_ERROR;
+        }
+    }
+
+    /* Surrogate pair. */
+
+    for (i = 0xD800; i <= 0xDFFF; i++) {
+        start = buf;
+
+        length = njs_utf16_encode(i, &start, end);
+        if (length < NJS_OK) {
+            njs_printf("utf16 test surrogate pair encode lead failed\n");
+            return NJS_ERROR;
+        }
+
+        length_to = njs_utf16_encode(i - 0xD800 + 0xDC00, &start, end);
+        if (length_to < NJS_OK) {
+            njs_printf("utf16 test surrogate pair encode failed\n");
+            return NJS_ERROR;
+        }
+
+        njs_utf16_decode_init(&ctx);
+
+        pos = buf;
+
+        cp = njs_utf16_decode(&ctx, &pos, buf + length + length_to);
+
+        if (i < 0xDC00) {
+
+            /* Lead surrogate followed by a trail surrogate: valid pair. */
+
+            expected = 0x10000 + ((i - 0xD800) << 10) + (i - 0xD800);
+
+            if (cp != expected) {
+                njs_printf("utf16 test surrogate pair decode failed\n");
+                return NJS_ERROR;
+            }
+
+        } else if (cp != NJS_UNICODE_ERROR) {
+
+            /* The sequence starts with a trail surrogate: invalid. */
+
+            njs_printf("utf16 test trail surrogate was not rejected\n");
+            return NJS_ERROR;
+        }
+    }
+
+    njs_printf("utf16 test passed\n");
+
     return NJS_OK;
 }
 
@@ -189,8 +284,11 @@ utf8_unit_test(njs_uint_t start)
 int
 main(int argc, char **argv)
 {
+    njs_int_t   ret;
     njs_uint_t  start;
 
+    njs_printf("unicode unit test started\n");
+
     if (argc > 1 && argv[1][0] == 'a') {
         start = NJS_UTF8_START_TEST;
 
@@ -198,5 +296,17 @@ main(int argc, char **argv)
         start = 256;
     }
 
-    return utf8_unit_test(start);
+    ret = utf8_unit_test(start);
+    if (ret != NJS_OK) {
+        return ret;
+    }
+
+    ret = utf16_unit_test();
+    if (ret != NJS_OK) {
+        return ret;
+    }
+
+    njs_printf("unicode unit test passed\n");
+
+    return 0;
 }