diff options
Diffstat (limited to 'src/utf.c')
-rw-r--r-- | src/utf.c | 946 |
1 files changed, 363 insertions, 583 deletions
@@ -12,7 +12,7 @@ ** This file contains routines used to translate between UTF-8, ** UTF-16, UTF-16BE, and UTF-16LE. ** -** $Id: utf.c,v 1.20 2004/06/17 05:36:44 danielk1977 Exp $ +** $Id: utf.c,v 1.21 2004/06/18 04:24:55 danielk1977 Exp $ ** ** Notes on UTF-8: ** @@ -48,31 +48,19 @@ ** When converting malformed UTF-16 strings to UTF-8, one instance of the ** replacement character U+FFFD for each pair of bytes that cannot be ** interpeted as part of a valid unicode character. +** +** This file contains the following public routines: +** +** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string. +** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings. +** sqlite3utf16ByteLen() - Calculate byte-length of a void* UTF16 string. +** sqlite3utf8CharLen() - Calculate char-length of a char* UTF8 string. +** sqlite3utf8LikeCompare() - Do a LIKE match given two UTF8 char* strings. +** */ #include <assert.h> #include "sqliteInt.h" - -typedef struct UtfString UtfString; -struct UtfString { - unsigned char *pZ; /* Raw string data */ - int n; /* Allocated length of pZ in bytes */ - int c; /* Number of pZ bytes already read or written */ -}; - -/* -** These two macros are used to interpret the first two bytes of the -** unsigned char array pZ as a 16-bit unsigned int. BE16() for a big-endian -** interpretation, LE16() for little-endian. -*/ -#define BE16(pZ) (((u16)((pZ)[0])<<8) + (u16)((pZ)[1])) -#define LE16(pZ) (((u16)((pZ)[1])<<8) + (u16)((pZ)[0])) - -/* -** READ_16 interprets the first two bytes of the unsigned char array pZ -** as a 16-bit unsigned int. If big_endian is non-zero the intepretation -** is big-endian, otherwise little-endian. -*/ -#define READ_16(pZ,big_endian) (big_endian?BE16(pZ):LE16(pZ)) +#include "vdbeInt.h" /* ** The following macro, LOWERCASE(x), takes an integer representing a @@ -96,353 +84,317 @@ static unsigned char UpperToLower[91] = { }; /* -** The first parameter, zStr, points at a unicode string. This routine -** reads a single character from the string and returns the codepoint value -** of the character read. -** -** The value of *pEnc is the string encoding. If *pEnc is SQLITE_UTF16LE or -** SQLITE_UTF16BE, and the first character read is a byte-order-mark, then -** the value of *pEnc is modified if necessary. In this case the next -** character is read and it's code-point value returned. -** -** The value of *pOffset is the byte-offset in zStr from which to begin -** reading. It is incremented by the number of bytes read by this function. -** -** If the fourth parameter, fold, is non-zero, then codepoint values are -** folded to lower-case before being returned. See comments for macro -** LOWERCASE(x) for details. +** This table maps from the first byte of a UTF-8 character to the number +** of trailing bytes expected. A value '255' indicates that the table key +** is not a legal first byte for a UTF-8 character. */ -int sqlite3ReadUniChar(const char *zStr, int *pOffset, u8 *pEnc, int fold){ - int ret = 0; - - switch( *pEnc ){ - case SQLITE_UTF8: { - -#if 0 - static const int initVal[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, - 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, - 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, - 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, - 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, - 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, - 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, - 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 0, 1, 2, - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, - 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 254, - 255, - }; - ret = initVal[(unsigned char)zStr[(*pOffset)++]]; - while( (0xc0&zStr[*pOffset])==0x80 ){ - ret = (ret<<6) | (0x3f&(zStr[(*pOffset)++])); - } -#endif - - struct Utf8TblRow { - u8 b1_mask; - u8 b1_masked_val; - u8 b1_value_mask; - int trailing_bytes; - }; - static const struct Utf8TblRow utf8tbl[] = { - { 0x80, 0x00, 0x7F, 0 }, - { 0xE0, 0xC0, 0x1F, 1 }, - { 0xF0, 0xE0, 0x0F, 2 }, - { 0xF8, 0xF0, 0x0E, 3 }, - { 0, 0, 0, 0} - }; - - u8 b1; /* First byte of the potentially multi-byte utf-8 character */ - int ii; - struct Utf8TblRow const *pRow; - - pRow = &(utf8tbl[0]); - - b1 = zStr[(*pOffset)++]; - while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){ - pRow++; - } - if( !pRow->b1_mask ){ - return (int)0xFFFD; - } - - ret = (u32)(b1&pRow->b1_value_mask); - for( ii=0; ii<pRow->trailing_bytes; ii++ ){ - u8 b = zStr[(*pOffset)++]; - if( (b&0xC0)!=0x80 ){ - return (int)0xFFFD; - } - ret = (ret<<6) + (u32)(b&0x3F); - } - break; - } - - case SQLITE_UTF16LE: - case SQLITE_UTF16BE: { - u32 code_point; /* the first code-point in the character */ - u32 code_point2; /* the second code-point in the character, if any */ - - code_point = READ_16(&zStr[*pOffset], (*pEnc==SQLITE_UTF16BE)); - *pOffset += 2; - - /* If this is a non-surrogate code-point, just cast it to an int and - ** this is the code-point value. - */ - if( code_point<0xD800 || code_point>0xE000 ){ - ret = code_point; - break; - } - - /* If this is a trailing surrogate code-point, then the string is - ** malformed; return the replacement character. - */ - if( code_point>0xDBFF ){ - return (int)0xFFFD; - } - - /* The code-point just read is a leading surrogate code-point. If their - ** is not enough data left or the next code-point is not a trailing - ** surrogate, return the replacement character. - */ - code_point2 = READ_16(&zStr[*pOffset], (*pEnc==SQLITE_UTF16BE)); - *pOffset += 2; - if( code_point2<0xDC00 || code_point>0xDFFF ){ - return (int)0xFFFD; - } - - ret = ( - (((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */ - ((code_point&0x003F)<<10) + /* xxxxxx */ - (code_point2&0x03FF) /* yy yyyyyyyy */ - ); - } - default: - assert(0); - } - - if( fold ){ - return LOWERCASE(ret); - } - return ret; -} +static const u8 xtra_utf8_bytes[256] = { +/* 0xxxxxxx */ +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +/* 10wwwwww */ +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + +/* 110yyyyy */ +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +/* 1110zzzz */ +2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + +/* 11110yyy */ +3, 3, 3, 3, 3, 3, 3, 3, 255, 255, 255, 255, 255, 255, 255, 255, +}; /* -** Read the BOM from the start of *pStr, if one is present. Return zero -** for little-endian, non-zero for big-endian. If no BOM is present, return -** the value of the parameter "big_endian". -** -** Return values: -** 1 -> big-endian string -** 0 -> little-endian string +** This table maps from the number of trailing bytes in a UTF-8 character +** to an integer constant that is effectively calculated for each character +** read by a naive implementation of a UTF-8 character reader. The code +** in the READ_UTF8 macro explains things best. */ -static int readUtf16Bom(UtfString *pStr, int big_endian){ - /* The BOM must be the first thing read from the string */ - assert( pStr->c==0 ); - - /* If the string data consists of 1 byte or less, the BOM will make no - ** difference anyway. In this case just fall through to the default case - ** and return the native byte-order for this machine. - ** - ** Otherwise, check the first 2 bytes of the string to see if a BOM is - ** present. - */ - if( pStr->n>1 ){ - u8 bom = sqlite3UtfReadBom(pStr->pZ, 2); - if( bom ){ - pStr->c += 2; - return (bom==SQLITE_UTF16LE)?0:1; - } - } +static const int xtra_utf8_bits[4] = { +0, +12416, /* (0xC0 << 6) + (0x80) */ +925824, /* (0xE0 << 12) + (0x80 << 6) + (0x80) */ +63447168 /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ +}; - return big_endian; +#define READ_UTF8(zIn, c) { \ + int xtra; \ + c = *(zIn)++; \ + xtra = xtra_utf8_bytes[c]; \ + switch( xtra ){ \ + case 255: c = (int)0xFFFD; break; \ + case 3: c = (c<<6) + *(zIn)++; \ + case 2: c = (c<<6) + *(zIn)++; \ + case 1: c = (c<<6) + *(zIn)++; \ + c -= xtra_utf8_bits[xtra]; \ + } \ } -/* -** zData is a UTF-16 encoded string, nData bytes in length. This routine -** checks if there is a byte-order mark at the start of zData. If no -** byte order mark is found 0 is returned. Otherwise SQLITE_UTF16BE or -** SQLITE_UTF16LE is returned, depending on whether The BOM indicates that -** the text is big-endian or little-endian. -*/ -u8 sqlite3UtfReadBom(const void *zData, int nData){ - if( nData<0 || nData>1 ){ - u8 b1 = *(u8 *)zData; - u8 b2 = *(((u8 *)zData) + 1); - if( b1==0xFE && b2==0xFF ){ - return SQLITE_UTF16BE; - } - if( b1==0xFF && b2==0xFE ){ - return SQLITE_UTF16LE; - } - } - return 0; +#define SKIP_UTF8(zIn) { \ + zIn += (xtra_utf8_bytes[*(u8 *)zIn] + 1); \ } - -/* -** Read a single unicode character from the UTF-8 encoded string *pStr. The -** value returned is a unicode scalar value. In the case of malformed -** strings, the unicode replacement character U+FFFD may be returned. -*/ -static u32 readUtf8(UtfString *pStr){ - u8 enc = SQLITE_UTF8; - return sqlite3ReadUniChar(pStr->pZ, &pStr->c, &enc, 0); +#define WRITE_UTF8(zOut, c) { \ + if( c<0x00080 ){ \ + *zOut++ = (c&0xFF); \ + } \ + else if( c<0x00800 ){ \ + *zOut++ = 0xC0 + ((c>>6)&0x1F); \ + *zOut++ = 0x80 + (c & 0x3F); \ + } \ + else if( c<0x10000 ){ \ + *zOut++ = 0xE0 + ((c>>12)&0x0F); \ + *zOut++ = 0x80 + ((c>>6) & 0x3F); \ + *zOut++ = 0x80 + (c & 0x3F); \ + }else{ \ + *zOut++ = 0xF0 + ((c>>18) & 0x07); \ + *zOut++ = 0x80 + ((c>>12) & 0x3F); \ + *zOut++ = 0x80 + ((c>>6) & 0x3F); \ + *zOut++ = 0x80 + (c & 0x3F); \ + } \ } -/* -** Write the unicode character 'code' to the string pStr using UTF-8 -** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails. -*/ -static int writeUtf8(UtfString *pStr, u32 code){ - struct Utf8WriteTblRow { - u32 max_code; - int trailing_bytes; - u8 b1_and_mask; - u8 b1_or_mask; - }; - static const struct Utf8WriteTblRow utf8tbl[] = { - {0x0000007F, 0, 0x7F, 0x00}, - {0x000007FF, 1, 0xDF, 0xC0}, - {0x0000FFFF, 2, 0xEF, 0xE0}, - {0x0010FFFF, 3, 0xF7, 0xF0}, - {0x00000000, 0, 0x00, 0x00} - }; - const struct Utf8WriteTblRow *pRow = &utf8tbl[0]; - - while( code>pRow->max_code ){ - assert( pRow->max_code ); - pRow++; - } +#define WRITE_UTF16LE(zOut, c) { \ + if( c<=0xFFFF ){ \ + *zOut++ = (c&0x00FF); \ + *zOut++ = ((c>>8)&0x00FF); \ + }else{ \ + *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ + *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \ + *zOut++ = (c&0x00FF); \ + *zOut++ = (0x00DC + ((c>>8)&0x03)); \ + } \ +} - /* Ensure there is enough room left in the output buffer to write - ** this UTF-8 character. - */ - assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) ); +#define WRITE_UTF16BE(zOut, c) { \ + if( c<=0xFFFF ){ \ + *zOut++ = ((c>>8)&0x00FF); \ + *zOut++ = (c&0x00FF); \ + }else{ \ + *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \ + *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ + *zOut++ = (0x00DC + ((c>>8)&0x03)); \ + *zOut++ = (c&0x00FF); \ + } \ +} - /* Write the UTF-8 encoded character to pStr. All cases below are - ** intentionally fall-through. - */ - switch( pRow->trailing_bytes ){ - case 3: - pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)|0x80; - code = code>>6; - case 2: - pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)|0x80; - code = code>>6; - case 1: - pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)|0x80; - code = code>>6; - case 0: - pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))|(pRow->b1_or_mask); - } - pStr->c += (pRow->trailing_bytes + 1); +#define READ_UTF16LE(zIn, c){ \ + c = (*zIn++); \ + c += ((*zIn++)<<8); \ + if( c>=0xD800 && c<=0xE000 ){ \ + int c2 = (*zIn++); \ + c2 += ((*zIn++)<<8); \ + c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ + } \ +} - return 0; +#define READ_UTF16BE(zIn, c){ \ + c = ((*zIn++)<<8); \ + c += (*zIn++); \ + if( c>=0xD800 && c<=0xE000 ){ \ + int c2 = ((*zIn++)<<8); \ + c2 += (*zIn++); \ + c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ + } \ } /* -** Read a single unicode character from the UTF-16 encoded string *pStr. The -** value returned is a unicode scalar value. In the case of malformed -** strings, the unicode replacement character U+FFFD may be returned. -** -** If big_endian is true, the string is assumed to be UTF-16BE encoded. -** Otherwise, it is UTF-16LE encoded. -*/ -static u32 readUtf16(UtfString *pStr, int big_endian){ - u32 code_point; /* the first code-point in the character */ +** If the TRANSLATE_TRACE macro is defined, the value of each Mem is +** printed on stderr on the way into and out of sqlite3VdbeMemTranslate(). +*/ +/* #define TRANSLATE_TRACE 1 */ - /* If there is only one byte of data left in the string, return the - ** replacement character. - */ - if( (pStr->n-pStr->c)==1 ){ - pStr->c++; - return (int)0xFFFD; +/* +** This routine transforms the internal text encoding used by pMem to +** desiredEnc. It is an error if the string is already of the desired +** encoding, or if *pMem does not contain a string value. +*/ +int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ + unsigned char zShort[NBFS]; /* Temporary short output buffer */ + int len; /* Maximum length of output string in bytes */ + unsigned char *zOut; /* Output buffer */ + unsigned char *zIn; /* Input iterator */ + unsigned char *zTerm; /* End of input */ + unsigned char *z; /* Output iterator */ + int c; + + assert( pMem->flags&MEM_Str ); + assert( pMem->enc!=desiredEnc ); + assert( pMem->enc!=0 ); + assert( pMem->n>=0 ); + +#ifdef TRANSLATE_TRACE + { + char zBuf[100]; + sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100); + fprintf(stderr, "INPUT: %s\n", zBuf); } +#endif - code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian); - pStr->c += 2; - - /* If this is a non-surrogate code-point, just cast it to an int and - ** return the code-point value. + /* If the translation is between UTF-16 little and big endian, then + ** all that is required is to swap the byte order. This case is handled + ** differently from the others. */ - if( code_point<0xD800 || code_point>0xE000 ){ - return code_point; + if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ + u8 temp; + sqlite3VdbeMemMakeWriteable(pMem); + zIn = pMem->z; + zTerm = &zIn[pMem->n]; + while( zIn<zTerm ){ + temp = *zIn; + *zIn = *(zIn+1); + zIn++; + *zIn++ = temp; + } + pMem->enc = desiredEnc; + goto translate_out; } - /* If this is a trailing surrogate code-point, then the string is - ** malformed; return the replacement character. + /* Set zIn to point at the start of the input buffer and zTerm to point 1 + ** byte past the end. + ** + ** Variable zOut is set to point at the output buffer. This may be space + ** obtained from malloc(), or Mem.zShort, if it large enough and not in + ** use, or the zShort array on the stack (see above). */ - if( code_point>0xDBFF ){ - return 0xFFFD; + zIn = pMem->z; + zTerm = &zIn[pMem->n]; + len = pMem->n*2 + 2; + if( len>NBFS ){ + zOut = sqliteMallocRaw(len); + if( !zOut ) return SQLITE_NOMEM; + }else{ + if( pMem->z==pMem->zShort ){ + zOut = zShort; + }else{ + zOut = pMem->zShort; + } } - - /* The code-point just read is a leading surrogate code-point. If their - ** is not enough data left or the next code-point is not a trailing - ** surrogate, return the replacement character. - */ - if( (pStr->n-pStr->c)>1 ){ - u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian); - if( code_point2<0xDC00 || code_point>0xDFFF ){ - return 0xFFFD; + z = zOut; + + if( pMem->enc==SQLITE_UTF8 ){ + if( desiredEnc==SQLITE_UTF16LE ){ + /* UTF-8 -> UTF-16 Little-endian */ + while( zIn<zTerm ){ + READ_UTF8(zIn, c); + WRITE_UTF16LE(z, c); + } + WRITE_UTF16LE(z, 0); + pMem->n = (z-zOut)-2; + }else if( desiredEnc==SQLITE_UTF16BE ){ + /* UTF-8 -> UTF-16 Big-endian */ + while( zIn<zTerm ){ + READ_UTF8(zIn, c); + WRITE_UTF16BE(z, c); + } + WRITE_UTF16BE(z, 0); + pMem->n = (z-zOut)-2; } - pStr->c += 2; - - return ( - (((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */ - ((code_point&0x003F)<<10) + /* xxxxxx */ - (code_point2&0x03FF) /* yy yyyyyyyy */ - ); + }else{ + assert( desiredEnc==SQLITE_UTF8 ); + if( pMem->enc==SQLITE_UTF16LE ){ + /* UTF-16 Little-endian -> UTF-8 */ + while( zIn<zTerm ){ + READ_UTF16LE(zIn, c); + WRITE_UTF8(z, c); + } + WRITE_UTF8(z, 0); + pMem->n = (z-zOut)-1; + }else{ + /* UTF-16 Little-endian -> UTF-8 */ + while( zIn<zTerm ){ + READ_UTF16BE(zIn, c); + WRITE_UTF8(z, c); + } + WRITE_UTF8(z, 0); + pMem->n = (z-zOut)-1; + } + } + sqlite3VdbeMemRelease(pMem); + pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short); + pMem->enc = desiredEnc; + if( (char *)zOut==pMem->zShort ){ + pMem->flags |= (MEM_Term|MEM_Short); + }else if( zOut==zShort ){ + memcpy(pMem->zShort, zOut, len); + zOut = pMem->zShort; + pMem->flags |= (MEM_Term|MEM_Short); }else{ - return (int)0xFFFD; + pMem->flags |= (MEM_Term|MEM_Dyn); } - - /* not reached */ + pMem->z = zOut; + +translate_out: +#ifdef TRANSLATE_TRACE + { + char zBuf[100]; + sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100); + fprintf(stderr, "OUTPUT: %s\n", zBuf); + } +#endif + return SQLITE_OK; } -static int writeUtf16(UtfString *pStr, int code, int big_endian){ - int bytes; - unsigned char *hi_byte; - unsigned char *lo_byte; - - bytes = (code>0x0000FFFF?4:2); - - /* Ensure there is enough room left in the output buffer to write - ** this UTF-8 character. - */ - assert( (pStr->n-pStr->c)>=bytes ); - - /* Initialise hi_byte and lo_byte to point at the locations into which - ** the MSB and LSB of the (first) 16-bit unicode code-point written for - ** this character. - */ - hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]); - lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]); +/* +** This routine checks for a byte-order mark at the beginning of the +** UTF-16 string stored in *pMem. If one is present, it is removed and +** the encoding of the Mem adjusted. This routine does not do any +** byte-swapping, it just sets Mem.enc appropriately. +** +** The allocation (static, dynamic etc.) and encoding of the Mem may be +** changed by this function. +*/ +int sqlite3VdbeMemHandleBom(Mem *pMem){ + int rc = SQLITE_OK; + u8 bom = 0; - if( bytes==2 ){ - *hi_byte = (u8)((code&0x0000FF00)>>8); - *lo_byte = (u8)(code&0x000000FF); - }else{ - u32 wrd; - wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)|0x0000D800; - *hi_byte = (u8)((wrd&0x0000FF00)>>8); - *lo_byte = (u8)(wrd&0x000000FF); - - wrd = (code&0x000003FF)|0x0000DC00; - *(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8); - *(lo_byte+2) = (u8)(wrd&0x000000FF); + if( pMem->n<0 || pMem->n>1 ){ + u8 b1 = *(u8 *)pMem->z; + u8 b2 = *(((u8 *)pMem->z) + 1); + if( b1==0xFE && b2==0xFF ){ + bom = SQLITE_UTF16BE; + } + if( b1==0xFF && b2==0xFE ){ + bom = SQLITE_UTF16LE; + } } - - pStr->c += bytes; - return 0; + if( bom ){ + if( pMem->flags & MEM_Short ){ + memmove(pMem->zShort, &pMem->zShort[2], NBFS-2); + pMem->n -= 2; + pMem->enc = bom; + } + else if( pMem->flags & MEM_Dyn ){ + void (*xDel)(void*) = pMem->xDel; + char *z = pMem->z; + pMem->z = 0; + pMem->xDel = 0; + rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT); + if( xDel ){ + xDel(z); + }else{ + sqliteFree(z); + } + }else{ + rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom, + SQLITE_TRANSIENT); + } + } + return rc; } /* @@ -452,22 +404,20 @@ static int writeUtf16(UtfString *pStr, int code, int big_endian){ ** number of unicode characters in the first nByte of pZ (or up to ** the first 0x00, whichever comes first). */ -int sqlite3utf8CharLen(const char *pZ, int nByte){ - UtfString str; - int ret = 0; - u32 code = 1; - - str.pZ = (char *)pZ; - str.n = nByte; - str.c = 0; - - while( (nByte<0 || str.c<str.n) && code!=0 ){ - code = readUtf8(&str); - ret++; +int sqlite3utf8CharLen(const char *z, int nByte){ + int r = 0; + const char *zTerm; + if( nByte>0 ){ + zTerm = &z[nByte]; + }else{ + zTerm = (const char *)(-1); } - if( code==0 ) ret--; - - return ret; + assert( z<=zTerm ); + while( *z!=0 && z<zTerm ){ + SKIP_UTF8(z); + r++; + } + return r; } /* @@ -477,242 +427,25 @@ int sqlite3utf8CharLen(const char *pZ, int nByte){ ** then return the number of bytes in the first nChar unicode characters ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first). */ -int sqlite3utf16ByteLen(const void *pZ, int nChar){ - if( nChar<0 ){ - const unsigned char *pC1 = (unsigned char *)pZ; - const unsigned char *pC2 = (unsigned char *)pZ+1; - while( *pC1 || *pC2 ){ - pC1 += 2; - pC2 += 2; +int sqlite3utf16ByteLen(const void *zIn, int nChar){ + int c = 1; + char const *z = zIn; + int n = 0; + if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ + while( c && ((nChar<0) || n<nChar) ){ + READ_UTF16BE(z, c); + n++; } - return pC1-(unsigned char *)pZ; }else{ - UtfString str; - u32 code = 1; - int big_endian; - int nRead = 0; - int ret; - - str.pZ = (char *)pZ; - str.c = 0; - str.n = -1; - - /* Check for a BOM. We just ignore it if there is one, it's only read - ** so that it is not counted as a character. - */ - big_endian = readUtf16Bom(&str, 0); - ret = 0-str.c; - - while( code!=0 && nRead<nChar ){ - code = readUtf16(&str, big_endian); - nRead++; + while( c && ((nChar<0) || n<nChar) ){ + READ_UTF16LE(z, c); + n++; } - if( code==0 ){ - ret -= 2; - } - return str.c + ret; } + return (z-(char const *)zIn)-((c==0)?2:0); } /* -** Convert a string in UTF-16 native byte (or with a Byte-order-mark or -** "BOM") into a UTF-8 string. The UTF-8 string is written into space -** obtained from sqlite3Malloc() and must be released by the calling function. -** -** The parameter N is the number of bytes in the UTF-16 string. If N is -** negative, the entire string up to the first \u0000 character is translated. -** -** The returned UTF-8 string is always \000 terminated. -*/ -unsigned char *sqlite3utf16to8(const void *pData, int N, int big_endian){ - UtfString in; - UtfString out; - - out.pZ = 0; - - in.pZ = (unsigned char *)pData; - in.n = N; - in.c = 0; - - if( in.n<0 ){ - in.n = sqlite3utf16ByteLen(in.pZ, -1); - } - - /* A UTF-8 encoding of a unicode string can require at most 1.5 times as - ** much space to store as the same string encoded using UTF-16. Allocate - ** this now. - */ - out.n = (in.n*1.5) + 1; - out.pZ = sqliteMalloc(out.n); - if( !out.pZ ){ - return 0; - } - out.c = 0; - - big_endian = readUtf16Bom(&in, big_endian); - while( in.c<in.n ){ - writeUtf8(&out, readUtf16(&in, big_endian)); - } - - /* Add the NULL-terminator character */ - assert( out.c<out.n ); - out.pZ[out.c] = 0x00; - - return out.pZ; -} - -static void *utf8toUtf16(const unsigned char *pIn, int N, int big_endian){ - UtfString in; - UtfString out; - - in.pZ = (unsigned char *)pIn; - in.n = N; - in.c = 0; - - if( in.n<0 ){ - in.n = strlen(in.pZ); - } - - /* A UTF-16 encoding of a unicode string can require at most twice as - ** much space to store as the same string encoded using UTF-8. Allocate - ** this now. - */ - out.n = (in.n*2) + 2; - out.pZ = sqliteMalloc(out.n); - if( !out.pZ ){ - return 0; - } - out.c = 0; - - while( in.c<in.n ){ - writeUtf16(&out, readUtf8(&in), big_endian); - } - - /* Add the NULL-terminator character */ - assert( (out.c+1)<out.n ); - out.pZ[out.c] = 0x00; - out.pZ[out.c+1] = 0x00; - - return out.pZ; -} - -/* -** Translate UTF-8 to UTF-16BE or UTF-16LE -*/ -void *sqlite3utf8to16be(const unsigned char *pIn, int N){ - return utf8toUtf16(pIn, N, 1); -} - -void *sqlite3utf8to16le(const unsigned char *pIn, int N){ - return utf8toUtf16(pIn, N, 0); -} - -/* -** This routine does the work for sqlite3utf16to16le() and -** sqlite3utf16to16be(). If big_endian is 1 the input string is -** transformed in place to UTF-16BE encoding. If big_endian is 0 then -** the input is transformed to UTF-16LE. -** -** Unless the first two bytes of the input string is a BOM, the input is -** assumed to be UTF-16 encoded using the machines native byte ordering. -*/ -static void utf16to16(void *pData, int N, int big_endian){ - UtfString inout; - inout.pZ = (unsigned char *)pData; - inout.c = 0; - inout.n = N; - - if( inout.n<0 ){ - inout.n = sqlite3utf16ByteLen(inout.pZ, -1); - } - - if( readUtf16Bom(&inout, SQLITE_BIGENDIAN)!=big_endian ){ - /* swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c); */ - int i; - for(i=0; i<(inout.n-inout.c); i += 2){ - char c1 = inout.pZ[i+inout.c]; - char c2 = inout.pZ[i+inout.c+1]; - inout.pZ[i] = c2; - inout.pZ[i+1] = c1; - } - }else if( inout.c ){ - memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c); - } - - inout.pZ[inout.n-inout.c] = 0x00; - inout.pZ[inout.n-inout.c+1] = 0x00; -} - -/* -** Convert a string in UTF-16 native byte or with a BOM into a UTF-16LE -** string. The conversion occurs in-place. The output overwrites the -** input. N bytes are converted. If N is negative everything is converted -** up to the first \u0000 character. -** -** If the native byte order is little-endian and there is no BOM, then -** this routine is a no-op. If there is a BOM at the start of the string, -** it is removed. -** -** Translation from UTF-16LE to UTF-16BE and back again is accomplished -** using the library function swab(). -*/ -void sqlite3utf16to16le(void *pData, int N){ - utf16to16(pData, N, 0); -} - -/* -** Convert a string in UTF-16 native byte or with a BOM into a UTF-16BE -** string. The conversion occurs in-place. The output overwrites the -** input. N bytes are converted. If N is negative everything is converted -** up to the first \u0000 character. -** -** If the native byte order is little-endian and there is no BOM, then -** this routine is a no-op. If there is a BOM at the start of the string, -** it is removed. -** -** Translation from UTF-16LE to UTF-16BE and back again is accomplished -** using the library function swab(). -*/ -void sqlite3utf16to16be(void *pData, int N){ - utf16to16(pData, N, 1); -} - -/* -** This function is used to translate between UTF-8 and UTF-16. The -** result is returned in dynamically allocated memory. -*/ -int sqlite3utfTranslate( - const void *zData, int nData, /* Input string */ - u8 enc1, /* Encoding of zData */ - void **zOut, int *nOut, /* Output string */ - u8 enc2 /* Desired encoding of output */ -){ - assert( enc1==SQLITE_UTF8 || enc1==SQLITE_UTF16LE || enc1==SQLITE_UTF16BE ); - assert( enc2==SQLITE_UTF8 || enc2==SQLITE_UTF16LE || enc2==SQLITE_UTF16BE ); - assert( - (enc1==SQLITE_UTF8 && (enc2==SQLITE_UTF16LE || enc2==SQLITE_UTF16BE)) || - (enc2==SQLITE_UTF8 && (enc1==SQLITE_UTF16LE || enc1==SQLITE_UTF16BE)) - ); - - if( enc1==SQLITE_UTF8 ){ - if( enc2==SQLITE_UTF16LE ){ - *zOut = sqlite3utf8to16le(zData, nData); - }else{ - *zOut = sqlite3utf8to16be(zData, nData); - } - if( !(*zOut) ) return SQLITE_NOMEM; - *nOut = sqlite3utf16ByteLen(*zOut, -1); - }else{ - *zOut = sqlite3utf16to8(zData, nData, enc1==SQLITE_UTF16BE); - if( !(*zOut) ) return SQLITE_NOMEM; - *nOut = strlen(*zOut); - } - return SQLITE_OK; -} - -#define sqliteNextChar(X) while( (0xc0&*++(X))==0x80 ){} - -/* ** Compare two UTF-8 strings for equality using the "LIKE" operator of ** SQL. The '%' character matches any sequence of 0 or more ** characters and '_' matches any single character. Case is @@ -731,7 +464,7 @@ int sqlite3utf8LikeCompare( while( (c=zPattern[1]) == '%' || c == '_' ){ if( c=='_' ){ if( *zString==0 ) return 0; - sqliteNextChar(zString); + SKIP_UTF8(zString); } zPattern++; } @@ -744,13 +477,13 @@ int sqlite3utf8LikeCompare( } if( c2==0 ) return 0; if( sqlite3utf8LikeCompare(&zPattern[1],zString) ) return 1; - sqliteNextChar(zString); + SKIP_UTF8(zString); } return 0; } case '_': { if( *zString==0 ) return 0; - sqliteNextChar(zString); + SKIP_UTF8(zString); zPattern++; break; } @@ -764,3 +497,50 @@ int sqlite3utf8LikeCompare( } return *zString==0; } + +#ifndef NDEBUG +/* +** This routine is called from the TCL test function "translate_selftest". +** It checks that the primitives for serializing and deserializing +** characters in each encoding are inverses of each other. +*/ +void sqlite3utfSelfTest(){ + int i; + unsigned char zBuf[20]; + unsigned char *z; + int n; + int c; + + for(i=0; 0 && i<0x00110000; i++){ + z = zBuf; + WRITE_UTF8(z, i); + n = z-zBuf; + z = zBuf; + READ_UTF8(z, c); + assert( c==i ); + assert( (z-zBuf)==n ); + } + for(i=0; i<0x00110000; i++){ + if( i>=0xD800 && i<=0xE000 ) continue; + z = zBuf; + WRITE_UTF16LE(z, i); + n = z-zBuf; + z = zBuf; + READ_UTF16LE(z, c); + assert( c==i ); + assert( (z-zBuf)==n ); + } + for(i=0; i<0x00110000; i++){ + if( i>=0xD800 && i<=0xE000 ) continue; + z = zBuf; + WRITE_UTF16BE(z, i); + n = z-zBuf; + z = zBuf; + READ_UTF16BE(z, c); + assert( c==i ); + assert( (z-zBuf)==n ); + } +} +#endif + + |