1 files changed, 363 insertions, 583 deletions
diff --git a/src/utf.c b/src/utf.c
index d257f3f48..98e13abf4 100644
--- a/src/utf.c
+++ b/src/utf.c
@@ -12,7 +12,7 @@
 ** This file contains routines used to translate between UTF-8, 
 ** UTF-16, UTF-16BE, and UTF-16LE.
 **
-** $Id: utf.c,v 1.20 2004/06/17 05:36:44 danielk1977 Exp $
+** $Id: utf.c,v 1.21 2004/06/18 04:24:55 danielk1977 Exp $
 **
 ** Notes on UTF-8:
 **
@@ -48,31 +48,19 @@
 ** When converting malformed UTF-16 strings to UTF-8, one instance of the
 ** replacement character U+FFFD for each pair of bytes that cannot be
 ** interpeted as part of a valid unicode character.
+**
+** This file contains the following public routines:
+**
+** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string.
+** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings.
+** sqlite3utf16ByteLen()     - Calculate byte-length of a void* UTF16 string.
+** sqlite3utf8CharLen()      - Calculate char-length of a char* UTF8 string.
+** sqlite3utf8LikeCompare()  - Do a LIKE match given two UTF8 char* strings.
+**
 */
 #include <assert.h>
 #include "sqliteInt.h"
-
-typedef struct UtfString UtfString;
-struct UtfString {
-  unsigned char *pZ;    /* Raw string data */
-  int n;                /* Allocated length of pZ in bytes */
-  int c;                /* Number of pZ bytes already read or written */
-};
-
-/*
-** These two macros are used to interpret the first two bytes of the 
-** unsigned char array pZ as a 16-bit unsigned int. BE16() for a big-endian
-** interpretation, LE16() for little-endian.
-*/
-#define BE16(pZ) (((u16)((pZ)[0])<<8) + (u16)((pZ)[1]))
-#define LE16(pZ) (((u16)((pZ)[1])<<8) + (u16)((pZ)[0]))
-
-/*
-** READ_16 interprets the first two bytes of the unsigned char array pZ 
-** as a 16-bit unsigned int. If big_endian is non-zero the intepretation
-** is big-endian, otherwise little-endian.
-*/
-#define READ_16(pZ,big_endian) (big_endian?BE16(pZ):LE16(pZ))
+#include "vdbeInt.h"
 
 /*
 ** The following macro, LOWERCASE(x), takes an integer representing a
@@ -96,353 +84,317 @@ static unsigned char UpperToLower[91] = {
 };
 
 /*
-** The first parameter, zStr, points at a unicode string. This routine
-** reads a single character from the string and returns the codepoint value
-** of the character read.
-**
-** The value of *pEnc is the string encoding. If *pEnc is SQLITE_UTF16LE or
-** SQLITE_UTF16BE, and the first character read is a byte-order-mark, then
-** the value of *pEnc is modified if necessary. In this case the next
-** character is read and it's code-point value returned.
-**
-** The value of *pOffset is the byte-offset in zStr from which to begin
-** reading. It is incremented by the number of bytes read by this function.
-**
-** If the fourth parameter, fold, is non-zero, then codepoint values are
-** folded to lower-case before being returned. See comments for macro
-** LOWERCASE(x) for details.
+** This table maps from the first byte of a UTF-8 character to the number
+** of trailing bytes expected. A value '255' indicates that the table key
+** is not a legal first byte for a UTF-8 character.
 */
-int sqlite3ReadUniChar(const char *zStr, int *pOffset, u8 *pEnc, int fold){
-  int ret = 0;
-
-  switch( *pEnc ){
-    case SQLITE_UTF8: {
-
-#if 0
-  static const int initVal[] = {
-      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
-     15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
-     30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
-     45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
-     60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
-     75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
-     90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
-    105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
-    120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
-    135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
-    150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
-    165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
-    180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,   0,   1,   2,
-      3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,
-     18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,   0,
-      1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
-      0,   1,   2,   3,   4,   5,   6,   7,   0,   1,   2,   3,   0,   1, 254,
-    255,
-  };
-  ret = initVal[(unsigned char)zStr[(*pOffset)++]];
-  while( (0xc0&zStr[*pOffset])==0x80 ){
-    ret = (ret<<6) | (0x3f&(zStr[(*pOffset)++]));
-  }
-#endif
-
-      struct Utf8TblRow {
-        u8 b1_mask;
-        u8 b1_masked_val;
-        u8 b1_value_mask;
-        int trailing_bytes;
-      };
-      static const struct Utf8TblRow utf8tbl[] = {
-        { 0x80, 0x00, 0x7F, 0 },
-        { 0xE0, 0xC0, 0x1F, 1 },
-        { 0xF0, 0xE0, 0x0F, 2 },
-        { 0xF8, 0xF0, 0x0E, 3 },
-        { 0, 0, 0, 0}
-      };
-    
-      u8 b1;   /* First byte of the potentially multi-byte utf-8 character */
-      int ii;
-      struct Utf8TblRow const *pRow;
-    
-      pRow = &(utf8tbl[0]);
-    
-      b1 = zStr[(*pOffset)++];
-      while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){
-        pRow++;
-      }
-      if( !pRow->b1_mask ){
-        return (int)0xFFFD;
-      }
-      
-      ret = (u32)(b1&pRow->b1_value_mask);
-      for( ii=0; ii<pRow->trailing_bytes; ii++ ){
-        u8 b = zStr[(*pOffset)++];
-        if( (b&0xC0)!=0x80 ){
-          return (int)0xFFFD;
-        }
-        ret = (ret<<6) + (u32)(b&0x3F);
-      }
-      break;
-    }
-
-    case SQLITE_UTF16LE:
-    case SQLITE_UTF16BE: {
-      u32 code_point;   /* the first code-point in the character */
-      u32 code_point2;  /* the second code-point in the character, if any */
-    
-      code_point = READ_16(&zStr[*pOffset], (*pEnc==SQLITE_UTF16BE));
-      *pOffset += 2;
-    
-      /* If this is a non-surrogate code-point, just cast it to an int and
-      ** this is the code-point value.
-      */
-      if( code_point<0xD800 || code_point>0xE000 ){
-        ret = code_point;
-        break;
-      }
-
-      /* If this is a trailing surrogate code-point, then the string is
-      ** malformed; return the replacement character.
-      */
-      if( code_point>0xDBFF ){
-        return (int)0xFFFD;
-      }
-    
-      /* The code-point just read is a leading surrogate code-point. If their
-      ** is not enough data left or the next code-point is not a trailing
-      ** surrogate, return the replacement character.
-      */
-      code_point2 = READ_16(&zStr[*pOffset], (*pEnc==SQLITE_UTF16BE));
-      *pOffset += 2;
-      if( code_point2<0xDC00 || code_point>0xDFFF ){
-        return (int)0xFFFD;
-      }
-   
-      ret = ( 
-          (((code_point&0x03C0)+0x0040)<<16) +   /* uuuuu */
-          ((code_point&0x003F)<<10) +            /* xxxxxx */
-          (code_point2&0x03FF)                   /* yy yyyyyyyy */
-      );
-    }
-    default:
-      assert(0);
-  }
-
-  if( fold ){
-    return LOWERCASE(ret);
-  }
-  return ret;
-}
+static const u8 xtra_utf8_bytes[256]  = {
+/* 0xxxxxxx: first bytes 0x00-0x7F, single-byte characters, no trailing bytes */
+0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0,     0, 0, 0, 0, 0, 0, 0, 0,
+
+/* 10wwwwww: 0x80-0xBF are continuation bytes, not a legal first byte (255) */
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+
+/* 110yyyyy: first bytes 0xC0-0xDF, one trailing byte */
+1, 1, 1, 1, 1, 1, 1, 1,     1, 1, 1, 1, 1, 1, 1, 1,
+1, 1, 1, 1, 1, 1, 1, 1,     1, 1, 1, 1, 1, 1, 1, 1,
+
+/* 1110zzzz: first bytes 0xE0-0xEF, two trailing bytes */
+2, 2, 2, 2, 2, 2, 2, 2,     2, 2, 2, 2, 2, 2, 2, 2,
+
+/* 11110yyy: 0xF0-0xF7 take three trailing bytes; 0xF8-0xFF are marked 255 */
+3, 3, 3, 3, 3, 3, 3, 3,     255, 255, 255, 255, 255, 255, 255, 255,
+};
 
 /*
-** Read the BOM from the start of *pStr, if one is present. Return zero
-** for little-endian, non-zero for big-endian. If no BOM is present, return
-** the value of the parameter "big_endian".
-**
-** Return values:
-**     1 -> big-endian string
-**     0 -> little-endian string
+** This table maps from the number of trailing bytes in a UTF-8 character
+** to an integer constant that is effectively calculated for each character
+** read by a naive implementation of a UTF-8 character reader. The code
+** in the READ_UTF8 macro explains things best.
 */
-static int readUtf16Bom(UtfString *pStr, int big_endian){
-  /* The BOM must be the first thing read from the string */
-  assert( pStr->c==0 );
-
-  /* If the string data consists of 1 byte or less, the BOM will make no
-  ** difference anyway. In this case just fall through to the default case
-  ** and return the native byte-order for this machine.
-  **
-  ** Otherwise, check the first 2 bytes of the string to see if a BOM is
-  ** present.
-  */
-  if( pStr->n>1 ){
-    u8 bom = sqlite3UtfReadBom(pStr->pZ, 2);
-    if( bom ){
-      pStr->c += 2;
-      return (bom==SQLITE_UTF16LE)?0:1;
-    }
-  }
+static const int xtra_utf8_bits[4] =  {   /* indexed by trailing-byte count */
+0,              /* no trailing bytes: READ_UTF8 never subtracts this entry */
+12416,          /* (0xC0 << 6) + (0x80) */
+925824,         /* (0xE0 << 12) + (0x80 << 6) + (0x80) */
+63447168        /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
+};
 
-  return big_endian;
+#define READ_UTF8(zIn, c) {  /* read one character into c and advance zIn */ \
+  int xtra;                      /* trailing-byte count for the first byte */ \
+  c = *(zIn)++;                  /* c indexes a 256-entry table: zIn should be an unsigned byte pointer */ \
+  xtra = xtra_utf8_bytes[c];                           \
+  switch( xtra ){                /* xtra==0 matches no case: c is already the value */ \
+    case 255: c = (int)0xFFFD; break;   /* illegal first byte -> U+FFFD */ \
+    case 3: c = (c<<6) + *(zIn)++;      /* falls through; trailing bytes are not validated */ \
+    case 2: c = (c<<6) + *(zIn)++;      /* falls through */ \
+    case 1: c = (c<<6) + *(zIn)++;      /* falls through */ \
+    c -= xtra_utf8_bits[xtra];          /* remove the accumulated marker bits */ \
+  }                                                    \
 }
 
-/*
-** zData is a UTF-16 encoded string, nData bytes in length. This routine
-** checks if there is a byte-order mark at the start of zData. If no
-** byte order mark is found 0 is returned. Otherwise SQLITE_UTF16BE or
-** SQLITE_UTF16LE is returned, depending on whether The BOM indicates that
-** the text is big-endian or little-endian.
-*/
-u8 sqlite3UtfReadBom(const void *zData, int nData){
-  if( nData<0 || nData>1 ){
-    u8 b1 = *(u8 *)zData;
-    u8 b2 = *(((u8 *)zData) + 1);
-    if( b1==0xFE && b2==0xFF ){
-      return SQLITE_UTF16BE;
-    }
-    if( b1==0xFF && b2==0xFE ){
-      return SQLITE_UTF16LE;
-    }
-  }
-  return 0;
+#define SKIP_UTF8(zIn) {  /* advance zIn past one character: the first byte, then any 10xxxxxx bytes that follow a multi-byte lead */ \
+  if( *(u8 *)(zIn++)>=0xC0 ){ while( (*(u8 *)zIn & 0xC0)==0x80 ){ zIn++; } }  /* stops at the first non-continuation byte, so an illegal or truncated sequence cannot jump past the terminator */ \
 }
 
-
-/*
-** Read a single unicode character from the UTF-8 encoded string *pStr. The
-** value returned is a unicode scalar value. In the case of malformed
-** strings, the unicode replacement character U+FFFD may be returned.
-*/
-static u32 readUtf8(UtfString *pStr){
-  u8 enc = SQLITE_UTF8;
-  return sqlite3ReadUniChar(pStr->pZ, &pStr->c, &enc, 0);
+#define WRITE_UTF8(zOut, c) {  /* append c to zOut as UTF-8, advancing zOut */ \
+  if( c<0x00080 ){                       /* 1 byte:  0xxxxxxx */ \
+    *zOut++ = (c&0xFF);                                \
+  }                                                    \
+  else if( c<0x00800 ){                  /* 2 bytes: 110xxxxx 10xxxxxx */ \
+    *zOut++ = 0xC0 + ((c>>6)&0x1F);                    \
+    *zOut++ = 0x80 + (c & 0x3F);                       \
+  }                                                    \
+  else if( c<0x10000 ){                  /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */ \
+    *zOut++ = 0xE0 + ((c>>12)&0x0F);                   \
+    *zOut++ = 0x80 + ((c>>6) & 0x3F);                  \
+    *zOut++ = 0x80 + (c & 0x3F);                       \
+  }else{                                 /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ \
+    *zOut++ = 0xF0 + ((c>>18) & 0x07);                 \
+    *zOut++ = 0x80 + ((c>>12) & 0x3F);                 \
+    *zOut++ = 0x80 + ((c>>6) & 0x3F);                  \
+    *zOut++ = 0x80 + (c & 0x3F);                       \
+  }                                                    \
 }
 
-/*
-** Write the unicode character 'code' to the string pStr using UTF-8
-** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails.
-*/
-static int writeUtf8(UtfString *pStr, u32 code){
-  struct Utf8WriteTblRow {
-    u32 max_code;
-    int trailing_bytes;
-    u8 b1_and_mask;
-    u8 b1_or_mask;
-  };
-  static const struct Utf8WriteTblRow utf8tbl[] = {
-    {0x0000007F, 0, 0x7F, 0x00},
-    {0x000007FF, 1, 0xDF, 0xC0},
-    {0x0000FFFF, 2, 0xEF, 0xE0},
-    {0x0010FFFF, 3, 0xF7, 0xF0},
-    {0x00000000, 0, 0x00, 0x00}
-  };
-  const struct Utf8WriteTblRow *pRow = &utf8tbl[0];
-
-  while( code>pRow->max_code ){
-    assert( pRow->max_code );
-    pRow++;
-  }
+#define WRITE_UTF16LE(zOut, c) {  /* append c to zOut as UTF-16LE, advancing zOut */ \
+  if( c<=0xFFFF ){                 /* one 16-bit unit, low byte first */ \
+    *zOut++ = (c&0x00FF);                                       \
+    *zOut++ = ((c>>8)&0x00FF);                                  \
+  }else{                           /* surrogate pair, each unit low byte first */ \
+    *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0));  \
+    *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03));              \
+    *zOut++ = (c&0x00FF);                                       \
+    *zOut++ = (0x00DC + ((c>>8)&0x03));                         \
+  }                                                             \
+}
 
-  /* Ensure there is enough room left in the output buffer to write
-  ** this UTF-8 character. 
-  */
-  assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) );
+#define WRITE_UTF16BE(zOut, c) {  /* append c to zOut as UTF-16BE, advancing zOut */ \
+  if( c<=0xFFFF ){                 /* one 16-bit unit, high byte first */ \
+    *zOut++ = ((c>>8)&0x00FF);                                  \
+    *zOut++ = (c&0x00FF);                                       \
+  }else{                           /* surrogate pair, each unit high byte first */ \
+    *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03));              \
+    *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0));  \
+    *zOut++ = (0x00DC + ((c>>8)&0x03));                         \
+    *zOut++ = (c&0x00FF);                                       \
+  }                                                             \
+}
 
-  /* Write the UTF-8 encoded character to pStr. All cases below are
-  ** intentionally fall-through.
-  */
-  switch( pRow->trailing_bytes ){
-    case 3:
-      pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)|0x80;
-      code = code>>6;
-    case 2:
-      pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)|0x80;
-      code = code>>6;
-    case 1:
-      pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)|0x80;
-      code = code>>6;
-    case 0:
-      pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))|(pRow->b1_or_mask);
-  }
-  pStr->c += (pRow->trailing_bytes + 1);
+#define READ_UTF16LE(zIn, c){  /* read one character from little-endian UTF-16 into c, advancing zIn (zIn must yield unsigned bytes) */ \
+  c = (*zIn++);                                                       \
+  c += ((*zIn++)<<8);                                                 \
+  if( c>=0xD800 && c<0xE000 ){   /* surrogates are D800..DFFF; U+E000 is an ordinary character */ \
+    int c2 = (*zIn++);           /* second 16-bit unit of the pair; not checked to be a trail surrogate */ \
+    c2 += ((*zIn++)<<8);                                              \
+    c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);   \
+  }                                                                   \
+}
 
-  return 0;
+#define READ_UTF16BE(zIn, c){  /* read one character from big-endian UTF-16 into c, advancing zIn (zIn must yield unsigned bytes) */ \
+  c = ((*zIn++)<<8);                                                  \
+  c += (*zIn++);                                                      \
+  if( c>=0xD800 && c<0xE000 ){   /* surrogates are D800..DFFF; U+E000 is an ordinary character */ \
+    int c2 = ((*zIn++)<<8);      /* second 16-bit unit of the pair; not checked to be a trail surrogate */ \
+    c2 += (*zIn++);                                                   \
+    c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);   \
+  }                                                                   \
 }
 
 /*
-** Read a single unicode character from the UTF-16 encoded string *pStr. The
-** value returned is a unicode scalar value. In the case of malformed
-** strings, the unicode replacement character U+FFFD may be returned.
-**
-** If big_endian is true, the string is assumed to be UTF-16BE encoded.
-** Otherwise, it is UTF-16LE encoded.
-*/
-static u32 readUtf16(UtfString *pStr, int big_endian){
-  u32 code_point;   /* the first code-point in the character */
+** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
+** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
+*/ 
+/* #define TRANSLATE_TRACE 1 */
 
-  /* If there is only one byte of data left in the string, return the 
-  ** replacement character.
-  */
-  if( (pStr->n-pStr->c)==1 ){
-    pStr->c++;
-    return (int)0xFFFD;
+/*
+** Transform the text held by pMem from its current encoding to desiredEnc.
+** It is an error if pMem is already in desiredEnc or does not hold a string.
+** Returns SQLITE_OK, or SQLITE_NOMEM if the output buffer cannot be allocated.
+*/
+int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
+  unsigned char zShort[NBFS]; /* Temporary short output buffer */
+  int len;                    /* Maximum length of output string in bytes */
+  unsigned char *zOut;                  /* Output buffer */
+  unsigned char *zIn;                   /* Input iterator */
+  unsigned char *zTerm;                 /* End of input */
+  unsigned char *z;                     /* Output iterator */
+  int c;                                /* Character being translated */
+
+  assert( pMem->flags&MEM_Str );
+  assert( pMem->enc!=desiredEnc );
+  assert( pMem->enc!=0 );
+  assert( pMem->n>=0 );
+
+#ifdef TRANSLATE_TRACE
+  {
+    char zBuf[100];
+    sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
+    fprintf(stderr, "INPUT:  %s\n", zBuf);
   }
+#endif
 
-  code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian);
-  pStr->c += 2;
-
-  /* If this is a non-surrogate code-point, just cast it to an int and
-  ** return the code-point value.
+  /* If the translation is between UTF-16 little and big endian, then 
+  ** all that is required is to swap the byte order. This case is handled
+  ** differently from the others.
   */
-  if( code_point<0xD800 || code_point>0xE000 ){
-    return code_point;
+  if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
+    u8 temp;
+    sqlite3VdbeMemMakeWriteable(pMem);  /* NOTE(review): result is ignored; confirm this cannot fail here */
+    zIn = pMem->z;
+    zTerm = &zIn[pMem->n&~1];   /* whole byte pairs only: an odd trailing byte must not be swapped with memory past the end */
+    while( zIn<zTerm ){
+      temp = *zIn;
+      *zIn = *(zIn+1);
+      zIn++;
+      *zIn++ = temp;
+    }
+    pMem->enc = desiredEnc;
+    goto translate_out;
   }
 
-  /* If this is a trailing surrogate code-point, then the string is
-  ** malformed; return the replacement character.
+  /* Set zIn to point at the start of the input buffer and zTerm to point 1
+  ** byte past the end.
+  **
+  ** Variable zOut is set to point at the output buffer. This may be space
+  ** obtained from malloc(), or Mem.zShort, if it is large enough and not in
+  ** use, or the zShort array on the stack (see above).
   */
-  if( code_point>0xDBFF ){
-    return 0xFFFD;
+  zIn = pMem->z;
+  zTerm = &zIn[pMem->n];
+  len = pMem->n*2 + 2;
+  if( len>NBFS ){
+    zOut = sqliteMallocRaw(len);
+    if( !zOut ) return SQLITE_NOMEM;
+  }else{
+    if( pMem->z==pMem->zShort ){
+      zOut = zShort;
+    }else{
+      zOut = pMem->zShort;
+    }
   }
-
-  /* The code-point just read is a leading surrogate code-point. If their
-  ** is not enough data left or the next code-point is not a trailing
-  ** surrogate, return the replacement character.
-  */
-  if( (pStr->n-pStr->c)>1 ){
-    u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian);
-    if( code_point2<0xDC00 || code_point>0xDFFF ){
-      return 0xFFFD;
+  z = zOut;
+
+  if( pMem->enc==SQLITE_UTF8 ){
+    if( desiredEnc==SQLITE_UTF16LE ){
+      /* UTF-8 -> UTF-16 Little-endian */
+      while( zIn<zTerm ){
+        READ_UTF8(zIn, c); 
+        WRITE_UTF16LE(z, c);
+      }
+      WRITE_UTF16LE(z, 0);
+      pMem->n = (z-zOut)-2;
+    }else if( desiredEnc==SQLITE_UTF16BE ){
+      /* UTF-8 -> UTF-16 Big-endian */
+      while( zIn<zTerm ){
+        READ_UTF8(zIn, c); 
+        WRITE_UTF16BE(z, c);
+      }
+      WRITE_UTF16BE(z, 0);
+      pMem->n = (z-zOut)-2;
     }
-    pStr->c += 2;
-
-    return ( 
-        (((code_point&0x03C0)+0x0040)<<16) +   /* uuuuu */
-        ((code_point&0x003F)<<10) +            /* xxxxxx */
-        (code_point2&0x03FF)                   /* yy yyyyyyyy */
-    );
+  }else{
+    assert( desiredEnc==SQLITE_UTF8 );
+    if( pMem->enc==SQLITE_UTF16LE ){
+      /* UTF-16 Little-endian -> UTF-8 */
+      while( zIn<zTerm ){
+        READ_UTF16LE(zIn, c); 
+        WRITE_UTF8(z, c);
+      }
+      WRITE_UTF8(z, 0);
+      pMem->n = (z-zOut)-1;
+    }else{
+      /* UTF-16 Big-endian -> UTF-8 */
+      while( zIn<zTerm ){
+        READ_UTF16BE(zIn, c); 
+        WRITE_UTF8(z, c);
+      }
+      WRITE_UTF8(z, 0);
+      pMem->n = (z-zOut)-1;
+    }
+  }
 
+  sqlite3VdbeMemRelease(pMem);
+  pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short);
+  pMem->enc = desiredEnc;
+  if( (char *)zOut==pMem->zShort ){
+    pMem->flags |= (MEM_Term|MEM_Short);
+  }else if( zOut==zShort ){
+    memcpy(pMem->zShort, zOut, len);
+    zOut = pMem->zShort;
+    pMem->flags |= (MEM_Term|MEM_Short);
   }else{
-    return (int)0xFFFD;
+    pMem->flags |= (MEM_Term|MEM_Dyn);
   }
-  
-  /* not reached */
+  pMem->z = zOut;
+
+translate_out:
+#ifdef TRANSLATE_TRACE
+  {
+    char zBuf[100];
+    sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
+    fprintf(stderr, "OUTPUT: %s\n", zBuf);
+  }
+#endif
+  return SQLITE_OK;
 }
 
-static int writeUtf16(UtfString *pStr, int code, int big_endian){
-  int bytes;
-  unsigned char *hi_byte;
-  unsigned char *lo_byte;
-
-  bytes = (code>0x0000FFFF?4:2);
-
-  /* Ensure there is enough room left in the output buffer to write
-  ** this UTF-8 character.
-  */
-  assert( (pStr->n-pStr->c)>=bytes );
-  
-  /* Initialise hi_byte and lo_byte to point at the locations into which
-  ** the MSB and LSB of the (first) 16-bit unicode code-point written for
-  ** this character.
-  */
-  hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]);
-  lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]);
+/*
+** This routine checks for a byte-order mark at the beginning of the 
+** UTF-16 string stored in *pMem. If one is present, it is removed and
+** the encoding of the Mem adjusted. This routine does not do any
+** byte-swapping, it just sets Mem.enc appropriately.
+**
+** The allocation (static, dynamic etc.) and encoding of the Mem may be
+** changed by this function.
+*/
+int sqlite3VdbeMemHandleBom(Mem *pMem){
+  int rc = SQLITE_OK;          /* Result of sqlite3VdbeMemSetStr(), if called */
+  u8 bom = 0;                  /* Encoding named by the BOM; 0 if none found */
 
-  if( bytes==2 ){
-    *hi_byte = (u8)((code&0x0000FF00)>>8);
-    *lo_byte = (u8)(code&0x000000FF);
-  }else{
-    u32 wrd;
-    wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)|0x0000D800;
-    *hi_byte = (u8)((wrd&0x0000FF00)>>8);
-    *lo_byte = (u8)(wrd&0x000000FF);
-
-    wrd = (code&0x000003FF)|0x0000DC00;
-    *(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8);
-    *(lo_byte+2) = (u8)(wrd&0x000000FF);
+  if( pMem->n<0 || pMem->n>1 ){   /* Inspect the first two bytes (a negative n is also let through) */
+    u8 b1 = *(u8 *)pMem->z;
+    u8 b2 = *(((u8 *)pMem->z) + 1);
+    if( b1==0xFE && b2==0xFF ){
+      bom = SQLITE_UTF16BE;
+    }
+    if( b1==0xFF && b2==0xFE ){
+      bom = SQLITE_UTF16LE;
+    }
   }
-
-  pStr->c += bytes;
   
-  return 0;
+  if( bom ){
+    if( pMem->flags & MEM_Short ){   /* Slide the zShort contents down over the BOM in place */
+      memmove(pMem->zShort, &pMem->zShort[2], NBFS-2);
+      pMem->n -= 2;
+      pMem->enc = bom;
+    }
+    else if( pMem->flags & MEM_Dyn ){   /* Detach the old buffer, re-set the string from past the BOM, then free it */
+      void (*xDel)(void*) = pMem->xDel;
+      char *z = pMem->z;
+      pMem->z = 0;
+      pMem->xDel = 0;
+      rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT);
+      if( xDel ){
+        xDel(z);
+      }else{
+        sqliteFree(z);
+      }
+    }else{   /* Neither MEM_Short nor MEM_Dyn: re-set the string from past the BOM */
+      rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom, 
+          SQLITE_TRANSIENT);
+    }
+  }
+  return rc;
 }
 
 /*
@@ -452,22 +404,20 @@ static int writeUtf16(UtfString *pStr, int code, int big_endian){
 ** number of unicode characters in the first nByte of pZ (or up to 
 ** the first 0x00, whichever comes first).
 */
-int sqlite3utf8CharLen(const char *pZ, int nByte){
-  UtfString str;
-  int ret = 0;
-  u32 code = 1;
-
-  str.pZ = (char *)pZ;
-  str.n = nByte;
-  str.c = 0;
-
-  while( (nByte<0 || str.c<str.n) && code!=0 ){
-    code = readUtf8(&str);
-    ret++;
+int sqlite3utf8CharLen(const char *z, int nByte){
+  int r = 0;                     /* Number of characters counted so far */
+  const char *zTerm;             /* One past the last byte that may be examined */
+  if( nByte>=0 ){                /* nByte==0 is a real limit: zero bytes, zero characters */
+    zTerm = &z[nByte];
+  }else{
+    zTerm = (const char *)(-1);  /* negative nByte: no byte limit, stop only at the 0x00 */
   }
-  if( code==0 ) ret--;
-
-  return ret;
+  assert( z<=zTerm );
+  while( z<zTerm && *z!=0 ){     /* bounds check first so *z is never read at zTerm */
+    SKIP_UTF8(z);
+    r++;
+  }
+  return r;
 }
 
 /*
@@ -477,242 +427,25 @@ int sqlite3utf8CharLen(const char *pZ, int nByte){
 ** then return the number of bytes in the first nChar unicode characters
 ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
 */
-int sqlite3utf16ByteLen(const void *pZ, int nChar){
-  if( nChar<0 ){
-    const unsigned char *pC1 = (unsigned char *)pZ;
-    const unsigned char *pC2 = (unsigned char *)pZ+1;
-    while( *pC1 || *pC2 ){
-      pC1 += 2;
-      pC2 += 2;
+int sqlite3utf16ByteLen(const void *zIn, int nChar){
+  int c = 1;                     /* Last character read; 0 once the terminator is seen */
+  unsigned char const *z = zIn;  /* Must be unsigned: the READ_UTF16 macros shift and add raw bytes */
+  int n = 0;                     /* Number of characters read so far */
+  if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
+    while( c && ((nChar<0) || n<nChar) ){
+      READ_UTF16BE(z, c);
+      n++;
     }
-    return pC1-(unsigned char *)pZ;
   }else{
-    UtfString str;
-    u32 code = 1;
-    int big_endian;
-    int nRead = 0;
-    int ret;
-
-    str.pZ = (char *)pZ;
-    str.c = 0;
-    str.n = -1;
-
-    /* Check for a BOM. We just ignore it if there is one, it's only read
-    ** so that it is not counted as a character. 
-    */
-    big_endian = readUtf16Bom(&str, 0);
-    ret = 0-str.c;
-
-    while( code!=0 && nRead<nChar ){
-      code = readUtf16(&str, big_endian);
-      nRead++;
+    while( c && ((nChar<0) || n<nChar) ){
+      READ_UTF16LE(z, c);
+      n++;
     }
-    if( code==0 ){
-      ret -= 2;
-    }
-    return str.c + ret;
   }
+  return (z-(unsigned char const *)zIn)-((c==0)?2:0);  /* bytes consumed, less the 2-byte terminator if it was read */
 }
 
 /*
-** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
-** "BOM") into a UTF-8 string.  The UTF-8 string is written into space 
-** obtained from sqlite3Malloc() and must be released by the calling function.
-**
-** The parameter N is the number of bytes in the UTF-16 string.  If N is
-** negative, the entire string up to the first \u0000 character is translated.
-**
-** The returned UTF-8 string is always \000 terminated.
-*/
-unsigned char *sqlite3utf16to8(const void *pData, int N, int big_endian){
-  UtfString in;
-  UtfString out;
-
-  out.pZ = 0;
-
-  in.pZ = (unsigned char *)pData;
-  in.n = N;
-  in.c = 0;
-
-  if( in.n<0 ){
-    in.n = sqlite3utf16ByteLen(in.pZ, -1);
-  }
-
-  /* A UTF-8 encoding of a unicode string can require at most 1.5 times as
-  ** much space to store as the same string encoded using UTF-16. Allocate
-  ** this now.
-  */
-  out.n = (in.n*1.5) + 1;
-  out.pZ = sqliteMalloc(out.n);
-  if( !out.pZ ){
-    return 0;
-  }
-  out.c = 0;
-
-  big_endian = readUtf16Bom(&in, big_endian);
-  while( in.c<in.n ){
-    writeUtf8(&out, readUtf16(&in, big_endian));
-  }
-
-  /* Add the NULL-terminator character */
-  assert( out.c<out.n );
-  out.pZ[out.c] = 0x00;
-
-  return out.pZ;
-}
-
-static void *utf8toUtf16(const unsigned char *pIn, int N, int big_endian){
-  UtfString in;
-  UtfString out;
-
-  in.pZ = (unsigned char *)pIn;
-  in.n = N;
-  in.c = 0;
-
-  if( in.n<0 ){
-    in.n = strlen(in.pZ);
-  }
-
-  /* A UTF-16 encoding of a unicode string can require at most twice as
-  ** much space to store as the same string encoded using UTF-8. Allocate
-  ** this now.
-  */
-  out.n = (in.n*2) + 2;
-  out.pZ = sqliteMalloc(out.n);
-  if( !out.pZ ){
-    return 0;
-  }
-  out.c = 0;
-
-  while( in.c<in.n ){
-    writeUtf16(&out, readUtf8(&in), big_endian);
-  }
-
-  /* Add the NULL-terminator character */
-  assert( (out.c+1)<out.n );
-  out.pZ[out.c] = 0x00;
-  out.pZ[out.c+1] = 0x00;
-
-  return out.pZ;
-}
-
-/*
-** Translate UTF-8 to UTF-16BE or UTF-16LE
-*/
-void *sqlite3utf8to16be(const unsigned char *pIn, int N){
-  return utf8toUtf16(pIn, N, 1);
-}
-
-void *sqlite3utf8to16le(const unsigned char *pIn, int N){
-  return utf8toUtf16(pIn, N, 0);
-}
-
-/* 
-** This routine does the work for sqlite3utf16to16le() and
-** sqlite3utf16to16be(). If big_endian is 1 the input string is
-** transformed in place to UTF-16BE encoding. If big_endian is 0 then
-** the input is transformed to UTF-16LE.
-**
-** Unless the first two bytes of the input string is a BOM, the input is
-** assumed to be UTF-16 encoded using the machines native byte ordering.
-*/
-static void utf16to16(void *pData, int N, int big_endian){
-  UtfString inout;
-  inout.pZ = (unsigned char *)pData;
-  inout.c = 0;
-  inout.n = N;
-
-  if( inout.n<0 ){
-    inout.n = sqlite3utf16ByteLen(inout.pZ, -1);
-  }
-
-  if( readUtf16Bom(&inout, SQLITE_BIGENDIAN)!=big_endian ){
-    /* swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c); */
-    int i;
-    for(i=0; i<(inout.n-inout.c); i += 2){
-      char c1 = inout.pZ[i+inout.c];
-      char c2 = inout.pZ[i+inout.c+1];
-      inout.pZ[i] = c2;
-      inout.pZ[i+1] = c1;
-    }
-  }else if( inout.c ){
-    memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c);
-  }
-
-  inout.pZ[inout.n-inout.c] = 0x00;
-  inout.pZ[inout.n-inout.c+1] = 0x00;
-}
-
-/*
-** Convert a string in UTF-16 native byte or with a BOM into a UTF-16LE
-** string.  The conversion occurs in-place.  The output overwrites the
-** input.  N bytes are converted.  If N is negative everything is converted
-** up to the first \u0000 character.
-**
-** If the native byte order is little-endian and there is no BOM, then
-** this routine is a no-op.  If there is a BOM at the start of the string,
-** it is removed.
-**
-** Translation from UTF-16LE to UTF-16BE and back again is accomplished
-** using the library function swab().
-*/
-void sqlite3utf16to16le(void *pData, int N){
-  utf16to16(pData, N, 0);
-}
-
-/*
-** Convert a string in UTF-16 native byte or with a BOM into a UTF-16BE
-** string.  The conversion occurs in-place.  The output overwrites the
-** input.  N bytes are converted.  If N is negative everything is converted
-** up to the first \u0000 character.
-**
-** If the native byte order is little-endian and there is no BOM, then
-** this routine is a no-op.  If there is a BOM at the start of the string,
-** it is removed.
-**
-** Translation from UTF-16LE to UTF-16BE and back again is accomplished
-** using the library function swab().
-*/
-void sqlite3utf16to16be(void *pData, int N){
-  utf16to16(pData, N, 1);
-}
-
-/*
-** This function is used to translate between UTF-8 and UTF-16. The
-** result is returned in dynamically allocated memory.
-*/
-int sqlite3utfTranslate(
-  const void *zData, int nData,  /* Input string */
-  u8 enc1,                       /* Encoding of zData */
-  void **zOut, int *nOut,        /* Output string */
-  u8 enc2                        /* Desired encoding of output */
-){
-  assert( enc1==SQLITE_UTF8 || enc1==SQLITE_UTF16LE || enc1==SQLITE_UTF16BE );
-  assert( enc2==SQLITE_UTF8 || enc2==SQLITE_UTF16LE || enc2==SQLITE_UTF16BE );
-  assert( 
-    (enc1==SQLITE_UTF8 && (enc2==SQLITE_UTF16LE || enc2==SQLITE_UTF16BE)) ||
-    (enc2==SQLITE_UTF8 && (enc1==SQLITE_UTF16LE || enc1==SQLITE_UTF16BE))
-  );
-
-  if( enc1==SQLITE_UTF8 ){
-    if( enc2==SQLITE_UTF16LE ){
-      *zOut = sqlite3utf8to16le(zData, nData);
-    }else{
-      *zOut = sqlite3utf8to16be(zData, nData);
-    }
-    if( !(*zOut) ) return SQLITE_NOMEM;
-    *nOut = sqlite3utf16ByteLen(*zOut, -1);
-  }else{
-    *zOut = sqlite3utf16to8(zData, nData, enc1==SQLITE_UTF16BE);
-    if( !(*zOut) ) return SQLITE_NOMEM;
-    *nOut = strlen(*zOut);
-  }
-  return SQLITE_OK;
-}
-
-#define sqliteNextChar(X)  while( (0xc0&*++(X))==0x80 ){}
-
-/*
 ** Compare two UTF-8 strings for equality using the "LIKE" operator of
 ** SQL.  The '%' character matches any sequence of 0 or more
 ** characters and '_' matches any single character.  Case is
@@ -731,7 +464,7 @@ int sqlite3utf8LikeCompare(
         while( (c=zPattern[1]) == '%' || c == '_' ){
           if( c=='_' ){
             if( *zString==0 ) return 0;
-            sqliteNextChar(zString);
+            SKIP_UTF8(zString);
           }
           zPattern++;
         }
@@ -744,13 +477,13 @@ int sqlite3utf8LikeCompare(
           }
           if( c2==0 ) return 0;
           if( sqlite3utf8LikeCompare(&zPattern[1],zString) ) return 1;
-          sqliteNextChar(zString);
+          SKIP_UTF8(zString);
         }
         return 0;
       }
       case '_': {
         if( *zString==0 ) return 0;
-        sqliteNextChar(zString);
+        SKIP_UTF8(zString);
         zPattern++;
         break;
       }
@@ -764,3 +497,50 @@ int sqlite3utf8LikeCompare(
   }
   return *zString==0;
 }
+
+#ifndef NDEBUG
+/*
+** This routine is called from the TCL test function "translate_selftest".
+** It checks that the primitives for serializing and deserializing
+** characters in each encoding are inverses of each other.
+*/
+void sqlite3utfSelfTest(){
+  int i;                       /* Code point under test */
+  unsigned char zBuf[20];      /* Scratch buffer holding one encoded character */
+  unsigned char *z;            /* Cursor into zBuf */
+  int n;                       /* Bytes produced by the WRITE_ macro */
+  int c;                       /* Value decoded by the READ_ macro */
+
+  for(i=0; 0 && i<0x00110000; i++){   /* UTF-8 round trip: the "0 &&" keeps this loop from ever running */
+    z = zBuf;
+    WRITE_UTF8(z, i);
+    n = z-zBuf;
+    z = zBuf;
+    READ_UTF8(z, c);
+    assert( c==i );
+    assert( (z-zBuf)==n );
+  }
+  for(i=0; i<0x00110000; i++){        /* UTF-16LE round trip */
+    if( i>=0xD800 && i<=0xE000 ) continue;   /* skips 0xD800..0xE000 inclusive */
+    z = zBuf;
+    WRITE_UTF16LE(z, i);
+    n = z-zBuf;
+    z = zBuf;
+    READ_UTF16LE(z, c);
+    assert( c==i );
+    assert( (z-zBuf)==n );            /* reader consumed exactly what the writer produced */
+  }
+  for(i=0; i<0x00110000; i++){        /* UTF-16BE round trip */
+    if( i>=0xD800 && i<=0xE000 ) continue;   /* skips 0xD800..0xE000 inclusive */
+    z = zBuf;
+    WRITE_UTF16BE(z, i);
+    n = z-zBuf;
+    z = zBuf;
+    READ_UTF16BE(z, c);
+    assert( c==i );
+    assert( (z-zBuf)==n );            /* reader consumed exactly what the writer produced */
+  }
+}
+#endif
+
+