1 files changed, 246 insertions, 0 deletions
diff --git a/ext/fts3/fts3_unicode.c b/ext/fts3/fts3_unicode.c
new file mode 100644
index 000000000..83b1c322b
--- /dev/null
+++ b/ext/fts3/fts3_unicode.c
@@ -0,0 +1,246 @@
+/*
+** 2012 May 24
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+******************************************************************************
+**
+** Implementation of the "unicode" full-text-search tokenizer.
+*/
+
+#ifndef SQLITE_DISABLE_FTS3_UNICODE
+
+#include "fts3Int.h"
+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "fts3_tokenizer.h"
+
+/*
+** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
+** from the sqlite3 source file utf.c. If this file is compiled as part
+** of the amalgamation, they are not required.
+*/
+#ifndef SQLITE_AMALGAMATION
+
+static const unsigned char sqlite3Utf8Trans1[] = {
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
+};
+
+#define READ_UTF8(zIn, zTerm, c)                           \
+  c = *(zIn++);                                            \
+  if( c>=0xc0 ){                                           \
+    c = sqlite3Utf8Trans1[c-0xc0];                         \
+    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
+      c = (c<<6) + (0x3f & *(zIn++));                      \
+    }                                                      \
+    if( c<0x80                                             \
+        || (c&0xFFFFF800)==0xD800                          \
+        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
+  }
+
+#define WRITE_UTF8(zOut, c) {                          \
+  if( c<0x00080 ){                                     \
+    *zOut++ = (u8)(c&0xFF);                            \
+  }                                                    \
+  else if( c<0x00800 ){                                \
+    *zOut++ = 0xC0 + (u8)((c>>6)&0x1F);                \
+    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
+  }                                                    \
+  else if( c<0x10000 ){                                \
+    *zOut++ = 0xE0 + (u8)((c>>12)&0x0F);               \
+    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
+    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
+  }else{                                               \
+    *zOut++ = 0xF0 + (u8)((c>>18) & 0x07);             \
+    *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);             \
+    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
+    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
+  }                                                    \
+}
+
+#endif /* ifndef SQLITE_AMALGAMATION */
+
+typedef struct unicode_tokenizer unicode_tokenizer;
+typedef struct unicode_cursor unicode_cursor;
+
+struct unicode_tokenizer {
+  sqlite3_tokenizer base;
+};
+
+struct unicode_cursor {
+  sqlite3_tokenizer_cursor base;
+  const unsigned char *aInput;    /* Input text being tokenized */
+  int nInput;                     /* Size of aInput[] in bytes */
+  int iOff;                       /* Current offset within aInput[] */
+  int iToken;                     /* Index of next token to be returned */
+  char *zToken;                   /* storage for current token */
+  int nAlloc;                     /* space allocated at zToken */
+};
+
+/*
+** Create a new tokenizer instance.
+*/
+static int unicodeCreate(
+  int nArg,                       /* Size of array argv[] */
+  const char * const *azArg,      /* Tokenizer creation arguments */
+  sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
+){
+  unicode_tokenizer *pNew;        /* New tokenizer object */
+  pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
+  if( pNew==NULL ){
+    return SQLITE_NOMEM;
+  }
+  memset(pNew, 0, sizeof(unicode_tokenizer));
+  *pp = &pNew->base;
+  return SQLITE_OK;
+}
+
+/*
+** Destroy a tokenizer allocated by unicodeCreate().
+*/
+static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
+  sqlite3_free(pTokenizer);
+  return SQLITE_OK;
+}
+
+/*
+** Prepare to begin tokenizing a particular string.  The input
+** string to be tokenized is pInput[0..nBytes-1].  A cursor
+** used to incrementally tokenize this string is returned in 
+** *ppCursor.
+*/
+static int unicodeOpen(
+  sqlite3_tokenizer *p,           /* The tokenizer */
+  const char *aInput,             /* Input string */
+  int nInput,                     /* Size of string aInput in bytes */
+  sqlite3_tokenizer_cursor **pp   /* OUT: New cursor object */
+){
+  unicode_cursor *pCsr;
+
+  pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor));
+  if( pCsr==0 ){
+    return SQLITE_NOMEM;
+  }
+  memset(pCsr, 0, sizeof(unicode_cursor));
+
+  pCsr->aInput = (const unsigned char *)aInput;
+  if( aInput==0 ){
+    pCsr->nInput = 0;
+  }else if( nInput<0 ){
+    pCsr->nInput = (int)strlen(aInput);
+  }else{
+    pCsr->nInput = nInput;
+  }
+
+  *pp = &pCsr->base;
+  UNUSED_PARAMETER(p);
+  return SQLITE_OK;
+}
+
+/*
+** Close a tokenization cursor previously opened by a call to
+** simpleOpen() above.
+*/
+static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
+  unicode_cursor *pCsr = (unicode_cursor *) pCursor;
+  sqlite3_free(pCsr->zToken);
+  sqlite3_free(pCsr);
+  return SQLITE_OK;
+}
+
+/*
+** Extract the next token from a tokenization cursor.  The cursor must
+** have been opened by a prior call to simpleOpen().
+*/
+static int unicodeNext(
+  sqlite3_tokenizer_cursor *p,    /* Cursor returned by simpleOpen */
+  const char **paToken,           /* OUT: Token text */
+  int *pnToken,                   /* OUT: Number of bytes at *paToken */
+  int *piStart,                   /* OUT: Starting offset of token */
+  int *piEnd,                     /* OUT: Ending offset of token */
+  int *piPos                      /* OUT: Position integer of token */
+){
+  unicode_cursor *pCsr = (unicode_cursor *)p;
+  int iCode;
+  char *zOut;
+  const unsigned char *z = &pCsr->aInput[pCsr->iOff];
+  const unsigned char *zStart = z;
+  const unsigned char *zEnd;
+  const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];
+
+  /* Scan past any delimiter characters before the start of the next token.
+  ** Return SQLITE_DONE early if this takes us all the way to the end of 
+  ** the input.  */
+  while( z<zTerm ){
+    READ_UTF8(z, zTerm, iCode);
+    if( sqlite3FtsUnicodeIsalnum(iCode) ) break;
+    zStart = z;
+  }
+  if( zStart>=zTerm ) return SQLITE_DONE;
+
+  zOut = pCsr->zToken;
+  do {
+    /* Grow the output buffer if required. */
+    if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
+      char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
+      if( !zNew ) return SQLITE_NOMEM;
+      zOut = &zNew[zOut - pCsr->zToken];
+      pCsr->zToken = zNew;
+      pCsr->nAlloc += 64;
+    }
+
+    /* Write the folded case of the last character read to the output */
+    zEnd = z;
+    WRITE_UTF8(zOut, sqlite3FtsUnicodeTolower(iCode));
+
+    /* If the cursor is not at EOF, read the next character */
+    if( z>=zTerm ) break;
+    READ_UTF8(z, zTerm, iCode);
+  }while( sqlite3FtsUnicodeIsalnum(iCode) );
+
+  /* Set the output variables and return. */
+  pCsr->iOff = (z - pCsr->aInput);
+  *paToken = pCsr->zToken;
+  *pnToken = zOut - pCsr->zToken;
+  *piStart = (zStart - pCsr->aInput);
+  *piEnd = (zEnd - pCsr->aInput);
+  *piPos = pCsr->iToken++;
+  return SQLITE_OK;
+}
+
+/*
+** Set *ppModule to a pointer to the sqlite3_tokenizer_module 
+** structure for the unicode tokenizer.
+*/
+void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const **ppModule){
+  static const sqlite3_tokenizer_module module = {
+    0,
+    unicodeCreate,
+    unicodeDestroy,
+    unicodeOpen,
+    unicodeClose,
+    unicodeNext,
+    0,
+  };
+  *ppModule = &module;
+}
+
+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
+#endif /* ifndef SQLITE_DISABLE_FTS3_UNICODE */