$NJS_BUILD_DIR/random_unit_test \\
$NJS_BUILD_DIR/rbtree_unit_test \\
$NJS_BUILD_DIR/lvlhsh_unit_test \\
- $NJS_BUILD_DIR/utf8_unit_test
+ $NJS_BUILD_DIR/unicode_unit_test
$NJS_BUILD_DIR/random_unit_test
$NJS_BUILD_DIR/rbtree_unit_test
$NJS_BUILD_DIR/lvlhsh_unit_test
- $NJS_BUILD_DIR/utf8_unit_test
+ $NJS_BUILD_DIR/unicode_unit_test
unit_test: $NJS_BUILD_DIR/njs_auto_config.h \\
$NJS_BUILD_DIR/njs_unit_test
src/njs_murmur_hash.c \
src/njs_djb_hash.c \
src/njs_utf8.c \
+ src/njs_utf16.c \
src/njs_arr.c \
src/njs_rbtree.c \
src/njs_lvlhsh.c \
src/test/lvlhsh_unit_test.c \
src/test/random_unit_test.c \
src/test/rbtree_unit_test.c \
- src/test/utf8_unit_test.c \
+ src/test/unicode_unit_test.c \
"
NJS_TEST_SRCS=" \
#include <njs_types.h>
#include <njs_clang.h>
#include <njs_str.h>
+#include <njs_unicode.h>
#include <njs_utf8.h>
+#include <njs_utf16.h>
#include <njs_diyfp.h>
#include <njs_dtoa.h>
#include <njs_dtoa_fixed.h>
--- /dev/null
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+#ifndef _NJS_UNICODE_H_INCLUDED_
+#define _NJS_UNICODE_H_INCLUDED_
+
+
+enum {  /* Code point limit and decoder status values; both statuses lie above the limit. */
+ NJS_UNICODE_MAX_CODEPOINT = 0x10FFFF,  /* Largest valid code point. */
+ NJS_UNICODE_ERROR = 0x1FFFFF,  /* Returned by njs_utf16_decode() for an invalid sequence. */
+ NJS_UNICODE_CONTINUE = 0x2FFFFF  /* Returned by njs_utf16_decode() when more input is needed. */
+};
+
+/*
+ * Incremental decoder state carried between njs_utf16_decode() calls.
+ */
+typedef struct {
+    /* Pending lead surrogate, or 0x00 when none is pending. */
+    uint32_t  codepoint;
+
+    /*
+     * Pending first byte of a split code unit, stored as (byte + 1) so that
+     * 0x00 means "nothing pending".  The field has to be wider than 8 bits:
+     * a pending 0xFF byte is stored as 0x100, which an 8-bit field would
+     * truncate to 0x00 and silently drop.
+     */
+    uint32_t  upper;
+} njs_unicode_decode_t;
+
+
+#endif /* _NJS_UNICODE_H_INCLUDED_ */
--- /dev/null
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+
+#include <njs_main.h>
+
+
+/*
+ * Stores the low 16 bits of "cp" as one UTF-16 code unit at *start and moves
+ * *start two bytes forward.  The byte order is fixed at build time by
+ * NJS_HAVE_BIG_ENDIAN.  The caller guarantees that two bytes are available.
+ */
+njs_inline void
+njs_utf16_encode_write(uint32_t cp, u_char **start)
+{
+    u_char  *p;
+
+    p = *start;
+
+#ifdef NJS_HAVE_BIG_ENDIAN
+    p[0] = cp >> 8;
+    p[1] = cp & 0x00FF;
+#else
+    p[0] = cp & 0x00FF;
+    p[1] = cp >> 8;
+#endif
+
+    *start = p + 2;
+}
+
+
+/*
+ * Encodes the code point "cp" as UTF-16 at *start and advances *start past
+ * the written bytes.  Byte order is the one used by
+ * njs_utf16_encode_write().
+ *
+ * Returns the number of bytes written (2 for cp < 0x10000, otherwise 4), or
+ * NJS_ERROR when "cp" is above 0x10FFFF or when [*start, end) is too small
+ * to hold the result; nothing is written in that case.
+ *
+ * Values in the surrogate range 0xD800..0xDFFF are written as a single code
+ * unit without validation.
+ */
+ssize_t
+njs_utf16_encode(uint32_t cp, u_char **start, const u_char *end)
+{
+    /*
+     * Compare the remaining space instead of computing "*start + N": forming
+     * a pointer more than one past the end of the buffer is undefined.
+     */
+
+    if ((end - *start) < 2) {
+        return NJS_ERROR;
+    }
+
+    if (cp < 0x10000) {
+        njs_utf16_encode_write(cp, start);
+
+        return 2;
+    }
+
+    /* Only values up to 0x10FFFF can be expressed as a surrogate pair. */
+
+    if (cp > 0x10FFFF || (end - *start) < 4) {
+        return NJS_ERROR;
+    }
+
+    cp -= 0x10000;
+
+    /* Lead surrogate carries the upper 10 bits, trail the lower 10 bits. */
+
+    njs_utf16_encode_write((0xD800 | (cp >> 0x0A)), start);
+    njs_utf16_encode_write((0xDC00 | (cp & 0x03FF)), start);
+
+    return 4;
+}
+
+
+/*
+ * Decodes one code point from the UTF-16 bytes in [*start, end) and advances
+ * *start past the consumed bytes.  Byte order is fixed at build time by
+ * NJS_HAVE_BIG_ENDIAN.
+ *
+ * "ctx" carries state between calls so the input may arrive in chunks; it
+ * must be reset with njs_utf16_decode_init() before the first call.
+ *
+ * Returns:
+ *   the decoded code point;
+ *   NJS_UNICODE_CONTINUE when the input is empty, or ended in the middle of
+ *     a code unit or right after a lead surrogate (call again with more
+ *     input);
+ *   NJS_UNICODE_ERROR for a trail surrogate without a lead one, or for a
+ *     lead surrogate that is not followed by a trail one.
+ */
+uint32_t
+njs_utf16_decode(njs_unicode_decode_t *ctx, const u_char **start,
+    const u_char *end)
+{
+    uint32_t  unit;
+    unsigned  lead;
+
+    /*
+     * Nothing to read: ask for more input and leave any pending state
+     * untouched instead of dereferencing *start beyond the buffer.
+     */
+
+    if (*start >= end) {
+        return NJS_UNICODE_CONTINUE;
+    }
+
+    if (ctx->upper != 0x00) {
+        /* Resume a code unit whose first byte arrived in an earlier call. */
+
+        lead = ctx->upper - 0x01;
+        ctx->upper = 0x00;
+
+        goto lead_state;
+    }
+
+pair_state:
+
+    lead = *(*start)++;
+
+    if (*start >= end) {
+        /*
+         * Stored as byte + 1 so that 0x00 means "no pending byte".
+         * NOTE(review): a pending 0xFF byte only survives if ctx->upper is
+         * wider than 8 bits.
+         */
+
+        ctx->upper = lead + 0x01;
+        return NJS_UNICODE_CONTINUE;
+    }
+
+lead_state:
+
+#ifdef NJS_HAVE_BIG_ENDIAN
+    unit = (lead << 8) + *(*start)++;
+#else
+    unit = (*(*start)++ << 8) + lead;
+#endif
+
+    if (ctx->codepoint != 0x00) {
+        /* A lead surrogate is pending: this unit must be a trail one. */
+
+        if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
+            unit = 0x10000 + ((ctx->codepoint - 0xD800) << 10)
+                   + (unit - 0xDC00);
+
+            ctx->codepoint = 0x00;
+
+            return unit;
+        }
+
+        /*
+         * Not a trail surrogate.  Un-read the second byte and keep the
+         * first one pending, so the next call decodes this unit again on
+         * its own after the error has been reported.
+         */
+
+        (*start)--;
+
+        ctx->upper = lead + 0x01;
+        ctx->codepoint = 0x00;
+
+        return NJS_UNICODE_ERROR;
+    }
+
+    /* Surrogate pair. */
+
+    if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
+        if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
+            /* Trail surrogate without a preceding lead one. */
+            return NJS_UNICODE_ERROR;
+        }
+
+        ctx->codepoint = unit;
+
+        if (*start >= end) {
+            return NJS_UNICODE_CONTINUE;
+        }
+
+        goto pair_state;
+    }
+
+    return unit;
+}
--- /dev/null
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+#ifndef _NJS_UTF16_H_INCLUDED_
+#define _NJS_UTF16_H_INCLUDED_
+
+
+NJS_EXPORT ssize_t njs_utf16_encode(uint32_t cp, u_char **start,
+ const u_char *end);
+NJS_EXPORT uint32_t njs_utf16_decode(njs_unicode_decode_t *ctx,
+ const u_char **start, const u_char *end);
+
+
+/*
+ * Resets "ctx" to the "nothing pending" state: no stored lead surrogate and
+ * no stored first byte of a split code unit.
+ */
+njs_inline void
+njs_utf16_decode_init(njs_unicode_decode_t *ctx)
+{
+    ctx->codepoint = 0x00;
+    ctx->upper = 0x00;
+}
+
+
+#endif /* _NJS_UTF16_H_INCLUDED_ */
#define NJS_UTF8_START_TEST 0xC2
-//#define NJS_UTF8_START_TEST 0
static u_char invalid[] = {
njs_uint_t i, k, l, m;
const u_char *pp;
- njs_printf("utf8 unit test started\n");
+ njs_printf("utf8 test started\n");
/* Test valid UTF-8. */
return NJS_ERROR;
}
- njs_printf("utf8 unit test passed\n");
+ njs_printf("utf8 test passed\n");
+ return NJS_OK;
+}
+
+
+/*
+ * Exercises njs_utf16_encode() and njs_utf16_decode():
+ *   - every non-surrogate code point must survive encode -> decode -> encode;
+ *   - every lead surrogate followed by a trail surrogate must decode to the
+ *     combined code point;
+ *   - a sequence that starts with a trail surrogate must be rejected.
+ *
+ * Returns NJS_OK on success, or NJS_ERROR after printing the failed step.
+ */
+static njs_int_t
+utf16_unit_test(void)
+{
+    u_char                *start, *end, *end_to;
+    ssize_t               k, length, length_to;
+    uint32_t              cp, i, expected;
+    njs_unicode_decode_t  ctx;
+    u_char                buf[8], to[4];
+
+    njs_printf("utf16 test started\n");
+
+    end = buf + sizeof(buf);
+    end_to = to + sizeof(to);
+
+    for (i = 0; i <= NJS_UNICODE_MAX_CODEPOINT; i++) {
+
+        /* Skip surrogate pair. */
+
+        if (i >= 0xD800 && i <= 0xDFFF) {
+            continue;
+        }
+
+        start = buf;
+
+        length = njs_utf16_encode(i, &start, end);
+        if (length < NJS_OK) {
+            njs_printf("utf16 test encode failed\n");
+            return NJS_ERROR;
+        }
+
+        njs_utf16_decode_init(&ctx);
+
+        start = buf;
+
+        cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + length);
+        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+            njs_printf("utf16 test decode failed\n");
+            return NJS_ERROR;
+        }
+
+        if (cp != i) {
+            njs_printf("utf16 test decode code point does not match\n");
+            return NJS_ERROR;
+        }
+
+        start = to;
+
+        length_to = njs_utf16_encode(cp, &start, end_to);
+        if (length_to < NJS_OK) {
+            njs_printf("utf16 test encode failed\n");
+            return NJS_ERROR;
+        }
+
+        if (length_to != length) {
+            njs_printf("utf16 test decode-encode failed\n");
+            return NJS_ERROR;
+        }
+
+        /*
+         * Compare byte by byte: UTF-16 output routinely contains 0x00
+         * bytes, at which a string comparison would stop early.
+         */
+
+        for (k = 0; k < length; k++) {
+            if (buf[k] != to[k]) {
+                njs_printf("utf16 test decode-encode failed\n");
+                return NJS_ERROR;
+            }
+        }
+    }
+
+    /* Surrogate pair. */
+
+    for (i = 0xD800; i <= 0xDFFF; i++) {
+        start = buf;
+
+        length = njs_utf16_encode(i, &start, end);
+        if (length < NJS_OK) {
+            njs_printf("utf16 test surrogate pair encode lead failed\n");
+            return NJS_ERROR;
+        }
+
+        length_to = njs_utf16_encode(i - 0xD800 + 0xDC00, &start, end);
+        if (length_to < NJS_OK) {
+            njs_printf("utf16 test surrogate pair encode failed\n");
+            return NJS_ERROR;
+        }
+
+        njs_utf16_decode_init(&ctx);
+
+        start = buf;
+
+        cp = njs_utf16_decode(&ctx, (const u_char **) &start,
+                              start + length + length_to);
+
+        if (i < 0xDC00) {
+
+            /* Lead surrogate followed by a trail one: a valid pair. */
+
+            if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+                njs_printf("utf16 test surrogate pair decode failed\n");
+                return NJS_ERROR;
+            }
+
+            /* The trail unit is 0xDC00 + (i - 0xD800) by construction. */
+
+            expected = 0x10000 + ((i - 0xD800) << 10) + (i - 0xD800);
+
+            if (cp != expected) {
+                njs_printf("utf16 test surrogate pair decode code point "
+                           "does not match\n");
+                return NJS_ERROR;
+            }
+
+        } else if (cp != NJS_UNICODE_ERROR) {
+
+            /* The sequence starts with a trail surrogate. */
+
+            njs_printf("utf16 test lone trail surrogate was not rejected\n");
+            return NJS_ERROR;
+        }
+    }
+
+    njs_printf("utf16 test passed\n");
+
return NJS_OK;
}
int
main(int argc, char **argv)
{
+ njs_int_t ret;
njs_uint_t start;
+ njs_printf("unicode unit test started\n");
+
if (argc > 1 && argv[1][0] == 'a') {
start = NJS_UTF8_START_TEST;
start = 256;
}
- return utf8_unit_test(start);
+ ret = utf8_unit_test(start);
+ if (ret != NJS_OK) {
+ return ret;
+ }
+
+ ret = utf16_unit_test();
+ if (ret != NJS_OK) {
+ return ret;
+ }
+
+ njs_printf("unicode unit test passed\n");
+
+ return 0;
}