diff options
author | danielk1977 <danielk1977@noemail.net> | 2004-06-18 04:24:54 +0000 |
---|---|---|
committer | danielk1977 <danielk1977@noemail.net> | 2004-06-18 04:24:54 +0000 |
commit | bfd6cce56bbb02a96fd7599ff89e1e807fa2df29 (patch) | |
tree | e0d7c19ec2260b540dc806932b07a75dec67f2d9 /src | |
parent | a2854229224e9e13eab1a9e9031057e6a259c38c (diff) | |
download | sqlite-bfd6cce56bbb02a96fd7599ff89e1e807fa2df29.tar.gz sqlite-bfd6cce56bbb02a96fd7599ff89e1e807fa2df29.zip |
Optimisation for unicode encoding conversion routines. (CVS 1614)
FossilOrigin-Name: 39a415eaa65964742e40b7ea4d471fa04007c6c9
Diffstat (limited to 'src')
-rw-r--r-- | src/build.c | 23 | ||||
-rw-r--r-- | src/main.c | 127 | ||||
-rw-r--r-- | src/sqliteInt.h | 21 | ||||
-rw-r--r-- | src/test1.c | 6 | ||||
-rw-r--r-- | src/test5.c | 284 | ||||
-rw-r--r-- | src/tokenize.c | 20 | ||||
-rw-r--r-- | src/utf.c | 946 | ||||
-rw-r--r-- | src/util.c | 38 | ||||
-rw-r--r-- | src/vdbe.c | 39 | ||||
-rw-r--r-- | src/vdbeInt.h | 2 | ||||
-rw-r--r-- | src/vdbeapi.c | 15 | ||||
-rw-r--r-- | src/vdbemem.c | 93 |
12 files changed, 630 insertions, 984 deletions
diff --git a/src/build.c b/src/build.c index 1cdc7974f..0114aeb47 100644 --- a/src/build.c +++ b/src/build.c @@ -23,7 +23,7 @@ ** ROLLBACK ** PRAGMA ** -** $Id: build.c,v 1.220 2004/06/17 06:13:34 danielk1977 Exp $ +** $Id: build.c,v 1.221 2004/06/18 04:24:54 danielk1977 Exp $ */ #include "sqliteInt.h" #include <ctype.h> @@ -959,15 +959,12 @@ static void callCollNeeded(sqlite *db, const char *zName, int nName){ db->xCollNeeded(db->pCollNeededArg, db, (int)db->enc, zExternal); } if( db->xCollNeeded16 ){ - if( SQLITE_BIGENDIAN ){ - zExternal = sqlite3utf8to16be(zName, nName); - }else{ - zExternal = sqlite3utf8to16le(zName, nName); - } + sqlite3_value *pTmp = sqlite3GetTransientValue(db); + sqlite3ValueSetStr(pTmp, -1, zName, SQLITE_UTF8, SQLITE_STATIC); + zExternal = sqlite3ValueText(pTmp, SQLITE_UTF16NATIVE); if( !zExternal ) return; db->xCollNeeded16(db->pCollNeededArg, db, (int)db->enc, zExternal); } - if( zExternal ) sqliteFree(zExternal); } static int synthCollSeq(Parse *pParse, CollSeq *pColl){ @@ -2627,3 +2624,15 @@ void sqlite3EndWriteOperation(Parse *pParse){ /* Delete me! */ return; } + +/* +** Return the transient sqlite3_value object used for encoding conversions +** during SQL compilation. +*/ +sqlite3_value *sqlite3GetTransientValue(sqlite *db){ + if( !db->pValue ){ + db->pValue = sqlite3ValueNew(); + } + return db->pValue; +} + diff --git a/src/main.c b/src/main.c index fa4bcc211..b7022d72e 100644 --- a/src/main.c +++ b/src/main.c @@ -14,7 +14,7 @@ ** other files are for internal use by SQLite and should not be ** accessed by users of the library. ** -** $Id: main.c,v 1.224 2004/06/16 12:00:56 danielk1977 Exp $ +** $Id: main.c,v 1.225 2004/06/18 04:24:54 danielk1977 Exp $ */ #include "sqliteInt.h" #include "os.h" @@ -515,6 +515,12 @@ void sqlite3_close(sqlite *db){ sqlite3HashClear(&db->aFunc); sqlite3Error(db, SQLITE_OK, 0); /* Deallocates any cached error strings. */ + if( db->pValue ){ + sqlite3ValueFree(db->pValue); + } + if( db->pErr ){ + sqlite3ValueFree(db->pErr); + } sqliteFree(db); } @@ -749,14 +755,17 @@ int sqlite3_create_function16( void (*xFinal)(sqlite3_context*) ){ int rc; - char *zFunctionName8; - zFunctionName8 = sqlite3utf16to8(zFunctionName, -1, SQLITE_BIGENDIAN); - if( !zFunctionName8 ){ + char const *zFunc8; + + sqlite3_value *pTmp = sqlite3GetTransientValue(db); + sqlite3ValueSetStr(pTmp, -1, zFunctionName, SQLITE_UTF16NATIVE,SQLITE_STATIC); + zFunc8 = sqlite3ValueText(pTmp, SQLITE_UTF8); + + if( !zFunc8 ){ return SQLITE_NOMEM; } - rc = sqlite3_create_function(db, zFunctionName8, nArg, eTextRep, + rc = sqlite3_create_function(db, zFunc8, nArg, eTextRep, iCollateArg, pUserData, xFunc, xStep, xFinal); - sqliteFree(zFunctionName8); return rc; } @@ -844,16 +853,16 @@ int sqlite3BtreeFactory( ** error. */ const char *sqlite3_errmsg(sqlite3 *db){ - if( !db ){ + if( !db || !db->pErr ){ /* If db is NULL, then assume that a malloc() failed during an ** sqlite3_open() call. */ return sqlite3ErrStr(SQLITE_NOMEM); } - if( db->zErrMsg ){ - return db->zErrMsg; + if( !sqlite3_value_text(db->pErr) ){ + return sqlite3ErrStr(db->errCode); } - return sqlite3ErrStr(db->errCode); + return sqlite3_value_text(db->pErr); } /* @@ -861,38 +870,32 @@ const char *sqlite3_errmsg(sqlite3 *db){ ** error. */ const void *sqlite3_errmsg16(sqlite3 *db){ - if( !db ){ - /* If db is NULL, then assume that a malloc() failed during an - ** sqlite3_open() call. We have a static version of the string - ** "out of memory" encoded using UTF-16 just for this purpose. - ** - ** Because all the characters in the string are in the unicode - ** range 0x00-0xFF, if we pad the big-endian string with a - ** zero byte, we can obtain the little-endian string with - ** &big_endian[1]. - */ - static char outOfMemBe[] = { - 0, 'o', 0, 'u', 0, 't', 0, ' ', - 0, 'o', 0, 'f', 0, ' ', - 0, 'm', 0, 'e', 0, 'm', 0, 'o', 0, 'r', 0, 'y', 0, 0, 0 - }; - static char *outOfMemLe = &outOfMemBe[1]; - - if( SQLITE_BIGENDIAN ){ - return (void *)outOfMemBe; - }else{ - return (void *)outOfMemLe; + /* Because all the characters in the string are in the unicode + ** range 0x00-0xFF, if we pad the big-endian string with a + ** zero byte, we can obtain the little-endian string with + ** &big_endian[1]. + */ + static char outOfMemBe[] = { + 0, 'o', 0, 'u', 0, 't', 0, ' ', + 0, 'o', 0, 'f', 0, ' ', + 0, 'm', 0, 'e', 0, 'm', 0, 'o', 0, 'r', 0, 'y', 0, 0, 0 + }; + + if( db && db->pErr ){ + if( !sqlite3_value_text16(db->pErr) ){ + sqlite3ValueSetStr(db->pErr, -1, sqlite3ErrStr(db->errCode), + SQLITE_UTF8, SQLITE_STATIC); } - } - if( !db->zErrMsg16 ){ - char const *zErr8 = sqlite3_errmsg(db); - if( SQLITE_BIGENDIAN ){ - db->zErrMsg16 = sqlite3utf8to16be(zErr8, -1); - }else{ - db->zErrMsg16 = sqlite3utf8to16le(zErr8, -1); + if( sqlite3_value_text16(db->pErr) ){ + return sqlite3_value_text16(db->pErr); } - } - return db->zErrMsg16; + } + + /* If db is NULL, then assume that a malloc() failed during an + ** sqlite3_open() call. We have a static version of the string + ** "out of memory" encoded using UTF-16 just for this purpose. + */ + return (void *)(&outOfMemBe[SQLITE_UTF16NATIVE==SQLITE_UTF16LE?1:0]); } int sqlite3_errcode(sqlite3 *db){ @@ -1047,11 +1050,14 @@ int sqlite3_prepare16( ** encoded string to UTF-8, then invoking sqlite3_prepare(). The ** tricky bit is figuring out the pointer to return in *pzTail. */ - char *zSql8 = 0; + char const *zSql8 = 0; char const *zTail8 = 0; int rc; + sqlite3_value *pTmp; - zSql8 = sqlite3utf16to8(zSql, nBytes, SQLITE_BIGENDIAN); + pTmp = sqlite3GetTransientValue(db); + sqlite3ValueSetStr(pTmp, -1, zSql, SQLITE_UTF16NATIVE, SQLITE_STATIC); + zSql8 = sqlite3ValueText(pTmp, SQLITE_UTF8); if( !zSql8 ){ sqlite3Error(db, SQLITE_NOMEM, 0); return SQLITE_NOMEM; @@ -1067,7 +1073,6 @@ int sqlite3_prepare16( int chars_parsed = sqlite3utf8CharLen(zSql8, zTail8-zSql8); *pzTail = (u8 *)zSql + sqlite3utf16ByteLen(zSql, chars_parsed); } - sqliteFree(zSql8); return rc; } @@ -1134,7 +1139,6 @@ static int openDatabase( } rc = sqlite3BtreeFactory(db, zFilename, 0, MAX_PAGES, &db->aDb[0].pBt); if( rc!=SQLITE_OK ){ - /* FIX ME: sqlite3BtreeFactory() should call sqlite3Error(). */ sqlite3Error(db, rc, 0); db->magic = SQLITE_MAGIC_CLOSED; goto opendb_out; @@ -1148,6 +1152,7 @@ static int openDatabase( */ sqlite3RegisterBuiltinFunctions(db); if( rc==SQLITE_OK ){ + sqlite3Error(db, SQLITE_OK, 0); db->magic = SQLITE_MAGIC_OPEN; }else{ sqlite3Error(db, rc, "%s", zErrMsg, 0); @@ -1177,21 +1182,24 @@ int sqlite3_open16( const void *zFilename, sqlite3 **ppDb ){ - char *zFilename8; /* zFilename encoded in UTF-8 instead of UTF-16 */ - int rc; + char const *zFilename8; /* zFilename encoded in UTF-8 instead of UTF-16 */ + int rc = SQLITE_NOMEM; + sqlite3_value *pVal; assert( ppDb ); - - zFilename8 = sqlite3utf16to8(zFilename, -1, SQLITE_BIGENDIAN); - if( !zFilename8 ){ - *ppDb = 0; - return SQLITE_NOMEM; + *ppDb = 0; + pVal = sqlite3ValueNew(); + sqlite3ValueSetStr(pVal, -1, zFilename, SQLITE_UTF16NATIVE, SQLITE_STATIC); + zFilename8 = sqlite3ValueText(pVal, SQLITE_UTF8); + if( zFilename8 ){ + rc = openDatabase(zFilename8, ppDb); + if( rc==SQLITE_OK && *ppDb ){ + sqlite3_exec(*ppDb, "PRAGMA encoding = 'UTF-16'", 0, 0, 0); + } } - rc = openDatabase(zFilename8, ppDb); - if( rc==SQLITE_OK && *ppDb ){ - sqlite3_exec(*ppDb, "PRAGMA encoding = 'UTF-16'", 0, 0, 0); + if( pVal ){ + sqlite3ValueFree(pVal); } - sqliteFree(zFilename8); return rc; } @@ -1273,10 +1281,11 @@ int sqlite3_create_collation16( int(*xCompare)(void*,int,const void*,int,const void*) ){ int rc; - char *zName8 = sqlite3utf16to8(zName, -1, SQLITE_BIGENDIAN); - rc = sqlite3_create_collation(db, zName8, enc, pCtx, xCompare); - sqliteFree(zName8); - return rc; + char const *zName8; + sqlite3_value *pTmp = sqlite3GetTransientValue(db); + sqlite3ValueSetStr(pTmp, -1, zName, SQLITE_UTF16NATIVE, SQLITE_STATIC); + zName8 = sqlite3ValueText(pTmp, SQLITE_UTF8); + return sqlite3_create_collation(db, zName8, enc, pCtx, xCompare); } /* diff --git a/src/sqliteInt.h b/src/sqliteInt.h index 416642534..05f51304e 100644 --- a/src/sqliteInt.h +++ b/src/sqliteInt.h @@ -11,7 +11,7 @@ ************************************************************************* ** Internal interface definitions for SQLite. ** -** @(#) $Id: sqliteInt.h,v 1.286 2004/06/17 05:36:44 danielk1977 Exp $ +** @(#) $Id: sqliteInt.h,v 1.287 2004/06/18 04:24:54 danielk1977 Exp $ */ #include "config.h" #include "sqlite3.h" @@ -194,6 +194,7 @@ extern const int sqlite3one; # define sqliteStrNDup(X,Y) sqlite3StrNDup_(X,Y,__FILE__,__LINE__) void sqlite3StrRealloc(char**); #else +# define sqlite3FreeX sqliteFree # define sqlite3Realloc_(X,Y) sqliteRealloc(X,Y) # define sqlite3StrRealloc(X) #endif @@ -422,14 +423,17 @@ struct sqlite { #endif int errCode; /* Most recent error code (SQLITE_*) */ - char *zErrMsg; /* Most recent error message (UTF-8 encoded) */ - void *zErrMsg16; /* Most recent error message (UTF-16 encoded) */ u8 enc; /* Text encoding for this database. */ u8 autoCommit; /* The auto-commit flag. */ int nMaster; /* Length of master journal name. -1=unknown */ void(*xCollNeeded)(void*,sqlite3*,int eTextRep,const char*); void(*xCollNeeded16)(void*,sqlite3*,int eTextRep,const void*); void *pCollNeededArg; + sqlite3_value *pValue; /* Value used for transient conversions */ + sqlite3_value *pErr; /* Most recent error message */ + + char *zErrMsg; /* Most recent error message (UTF-8 encoded) */ + char *zErrMsg16; /* Most recent error message (UTF-8 encoded) */ }; /* @@ -1213,6 +1217,7 @@ void sqlite3RealToSortable(double r, char *); char *sqlite3StrDup_(const char*,char*,int); char *sqlite3StrNDup_(const char*, int,char*,int); void sqlite3CheckMemory(void*,int); + void sqlite3FreeX(void *p); #else void *sqliteMalloc(int); void *sqliteMallocRaw(int); @@ -1375,11 +1380,6 @@ char *sqlite3_snprintf(int,char*,const char*,...); int sqlite3GetInt32(const char *, int*); int sqlite3GetInt64(const char *, i64*); int sqlite3FitsIn64Bits(const char *); -unsigned char *sqlite3utf16to8(const void *pData, int N, int big_endian); -void *sqlite3utf8to16be(const unsigned char *pIn, int N); -void *sqlite3utf8to16le(const unsigned char *pIn, int N); -void sqlite3utf16to16le(void *pData, int N); -void sqlite3utf16to16be(void *pData, int N); int sqlite3utf16ByteLen(const void *pData, int nChar); int sqlite3utf8CharLen(const char *pData, int nByte); int sqlite3utf8LikeCompare(const unsigned char*, const unsigned char*); @@ -1396,8 +1396,6 @@ int sqlite3IndexAffinityOk(Expr *pExpr, char idx_affinity); char sqlite3ExprAffinity(Expr *pExpr); int sqlite3atoi64(const char*, i64*); void sqlite3Error(sqlite *, int, const char*,...); -int sqlite3utfTranslate(const void *, int , u8 , void **, int *, u8); -u8 sqlite3UtfReadBom(const void *zData, int nData); void *sqlite3HexToBlob(const char *z); int sqlite3TwoPartName(Parse *, Token *, Token *, Token **); const char *sqlite3ErrStr(int); @@ -1412,6 +1410,7 @@ int sqlite3CheckObjectName(Parse *, const char *); const void *sqlite3ValueText(sqlite3_value*, u8); int sqlite3ValueBytes(sqlite3_value*, u8); -void sqlite3ValueSetStr(sqlite3_value*, int, const void *,u8); +void sqlite3ValueSetStr(sqlite3_value*, int, const void *,u8, void(*)(void*)); void sqlite3ValueFree(sqlite3_value*); sqlite3_value *sqlite3ValueNew(); +sqlite3_value *sqlite3GetTransientValue(sqlite *db); diff --git a/src/test1.c b/src/test1.c index ce122405f..e5cc0e218 100644 --- a/src/test1.c +++ b/src/test1.c @@ -13,7 +13,7 @@ ** is not included in the SQLite library. It is used for automated ** testing of the SQLite library. ** -** $Id: test1.c,v 1.77 2004/06/15 02:44:19 danielk1977 Exp $ +** $Id: test1.c,v 1.78 2004/06/18 04:24:55 danielk1977 Exp $ */ #include "sqliteInt.h" #include "tcl.h" @@ -940,9 +940,9 @@ static int test_collate_func( } pVal = sqlite3ValueNew(); - sqlite3ValueSetStr(pVal, nA, zA, encin); + sqlite3ValueSetStr(pVal, nA, zA, encin, SQLITE_STATIC); Tcl_ListObjAppendElement(i,pX,Tcl_NewStringObj(sqlite3_value_text(pVal),-1)); - sqlite3ValueSetStr(pVal, nB, zB, encin); + sqlite3ValueSetStr(pVal, nB, zB, encin, SQLITE_STATIC); Tcl_ListObjAppendElement(i,pX,Tcl_NewStringObj(sqlite3_value_text(pVal),-1)); sqlite3ValueFree(pVal); diff --git a/src/test5.c b/src/test5.c index 525716be1..8ce005323 100644 --- a/src/test5.c +++ b/src/test5.c @@ -15,7 +15,7 @@ ** is used for testing the SQLite routines for converting between ** the various supported unicode encodings. ** -** $Id: test5.c,v 1.10 2004/06/12 00:42:35 danielk1977 Exp $ +** $Id: test5.c,v 1.11 2004/06/18 04:24:55 danielk1977 Exp $ */ #include "sqliteInt.h" #include "vdbeInt.h" @@ -25,195 +25,6 @@ #include <string.h> /* -** Return the number of bytes up to and including the first pair of -** 0x00 bytes in *pStr. -*/ -static int utf16_length(const unsigned char *pZ){ - const unsigned char *pC1 = pZ; - const unsigned char *pC2 = pZ+1; - while( *pC1 || *pC2 ){ - pC1 += 2; - pC2 += 2; - } - return (pC1-pZ)+2; -} - -/* -** tclcmd: sqlite_utf8to16le STRING -** title: Convert STRING from utf-8 to utf-16le -** -** Return the utf-16le encoded string -*/ -static int sqlite_utf8to16le( - void * clientData, - Tcl_Interp *interp, - int objc, - Tcl_Obj *CONST objv[] -){ - unsigned char *out; - unsigned char *in; - Tcl_Obj *res; - - if( objc!=2 ){ - Tcl_AppendResult(interp, "wrong # args: should be \"", - Tcl_GetStringFromObj(objv[0], 0), "<utf-8 encoded-string>", 0); - return TCL_ERROR; - } - - in = Tcl_GetString(objv[1]); - out = (unsigned char *)sqlite3utf8to16le(in, -1); - res = Tcl_NewByteArrayObj(out, utf16_length(out)); - sqliteFree(out); - - Tcl_SetObjResult(interp, res); - - return TCL_OK; -} - -/* -** tclcmd: sqlite_utf8to16be STRING -** title: Convert STRING from utf-8 to utf-16be -** -** Return the utf-16be encoded string -*/ -static int sqlite_utf8to16be( - void * clientData, - Tcl_Interp *interp, - int objc, - Tcl_Obj *CONST objv[] -){ - unsigned char *out; - unsigned char *in; - Tcl_Obj *res; - - if( objc!=2 ){ - Tcl_AppendResult(interp, "wrong # args: should be \"", - Tcl_GetStringFromObj(objv[0], 0), "<utf-8 encoded-string>", 0); - return TCL_ERROR; - } - - in = Tcl_GetByteArrayFromObj(objv[1], 0); - in = Tcl_GetString(objv[1]); - out = (unsigned char *)sqlite3utf8to16be(in, -1); - res = Tcl_NewByteArrayObj(out, utf16_length(out)); - sqliteFree(out); - - Tcl_SetObjResult(interp, res); - - return TCL_OK; -} - -/* -** tclcmd: sqlite_utf16to16le STRING -** title: Convert STRING from utf-16 in native byte order to utf-16le -** -** Return the utf-16le encoded string. If the input string contains -** a byte-order mark, then the byte order mark should override the -** native byte order. -*/ -static int sqlite_utf16to16le( - void * clientData, - Tcl_Interp *interp, - int objc, - Tcl_Obj *CONST objv[] -){ - unsigned char *out; - unsigned char *in; - int in_len; - Tcl_Obj *res; - - if( objc!=2 ){ - Tcl_AppendResult(interp, "wrong # args: should be \"", - Tcl_GetStringFromObj(objv[0], 0), "<utf-16 encoded-string>", 0); - return TCL_ERROR; - } - - in = Tcl_GetByteArrayFromObj(objv[1], &in_len); - out = (unsigned char *)sqliteMalloc(in_len); - memcpy(out, in, in_len); - - sqlite3utf16to16le(out, -1); - res = Tcl_NewByteArrayObj(out, utf16_length(out)); - sqliteFree(out); - - Tcl_SetObjResult(interp, res); - - return TCL_OK; -} - -/* -** tclcmd: sqlite_utf16to16be STRING -** title: Convert STRING from utf-16 in native byte order to utf-16be -** -** Return the utf-16be encoded string. If the input string contains -** a byte-order mark, then the byte order mark should override the -** native byte order. -*/ -static int sqlite_utf16to16be( - void * clientData, - Tcl_Interp *interp, - int objc, - Tcl_Obj *CONST objv[] -){ - unsigned char *out; - unsigned char *in; - int in_len; - Tcl_Obj *res; - - if( objc!=2 ){ - Tcl_AppendResult(interp, "wrong # args: should be \"", - Tcl_GetStringFromObj(objv[0], 0), "<utf-16 encoded-string>", 0); - return TCL_ERROR; - } - - in = Tcl_GetByteArrayFromObj(objv[1], &in_len); - out = (unsigned char *)sqliteMalloc(in_len); - memcpy(out, in, in_len); - - sqlite3utf16to16be(out, -1); - res = Tcl_NewByteArrayObj(out, utf16_length(out)); - sqliteFree(out); - - Tcl_SetObjResult(interp, res); - - return TCL_OK; -} - -/* -** tclcmd: sqlite_utf16to8 STRING -** title: Convert STRING from utf-16 in native byte order to utf-8 -** -** Return the utf-8 encoded string. If the input string contains -** a byte-order mark, then the byte order mark should override the -** native byte order. -*/ -static int sqlite_utf16to8( - void * clientData, - Tcl_Interp *interp, - int objc, - Tcl_Obj *CONST objv[] -){ - unsigned char *out; - unsigned char *in; - Tcl_Obj *res; - - if( objc!=2 ){ - Tcl_AppendResult(interp, "wrong # args: should be \"", - Tcl_GetStringFromObj(objv[0], 0), " <utf-16 encoded-string>", 0); - return TCL_ERROR; - } - - in = Tcl_GetByteArrayFromObj(objv[1], 0); - out = sqlite3utf16to8(in, -1, SQLITE_BIGENDIAN); - res = Tcl_NewByteArrayObj(out, strlen(out)+1); - sqliteFree(out); - - Tcl_SetObjResult(interp, res); - - return TCL_OK; -} - -/* ** The first argument is a TCL UTF-8 string. Return the byte array ** object with the encoded representation of the string, including ** the NULL terminator. @@ -281,6 +92,92 @@ static int test_value_overhead( return TCL_OK; } +static u8 name_to_enc(Tcl_Interp *interp, Tcl_Obj *pObj){ + struct EncName { + char *zName; + u8 enc; + } encnames[] = { + { "UTF8", SQLITE_UTF8 }, + { "UTF16LE", SQLITE_UTF16LE }, + { "UTF16BE", SQLITE_UTF16BE }, + { "UTF16", SQLITE_UTF16NATIVE }, + { 0, 0 } + }; + struct EncName *pEnc; + char *z = Tcl_GetString(pObj); + for(pEnc=&encnames[0]; pEnc->zName; pEnc++){ + if( 0==sqlite3StrICmp(z, pEnc->zName) ){ + break; + } + } + if( !pEnc->enc ){ + Tcl_AppendResult(interp, "No such encoding: ", z, 0); + } + return pEnc->enc; +} + +static int test_translate( + void * clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ + u8 enc_from; + u8 enc_to; + sqlite3_value *pVal; + + const char *z; + int len; + + if( objc!=4 ){ + Tcl_AppendResult(interp, "wrong # args: should be \"", + Tcl_GetStringFromObj(objv[0], 0), + " <string/blob> <from enc> <to enc>", 0 + ); + return TCL_ERROR; + } + + enc_from = name_to_enc(interp, objv[2]); + if( !enc_from ) return TCL_ERROR; + enc_to = name_to_enc(interp, objv[3]); + if( !enc_to ) return TCL_ERROR; + + pVal = sqlite3ValueNew(); + + if( enc_from==SQLITE_UTF8 ){ + z = Tcl_GetString(objv[1]); + sqlite3ValueSetStr(pVal, -1, z, enc_from, SQLITE_STATIC); + }else{ + z = Tcl_GetByteArrayFromObj(objv[1], &len); + sqlite3ValueSetStr(pVal, -1, z, enc_from, SQLITE_STATIC); + } + + z = sqlite3ValueText(pVal, enc_to); + len = sqlite3ValueBytes(pVal, enc_to) + (enc_to==SQLITE_UTF8?1:2); + Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(z, len)); + + sqlite3ValueFree(pVal); + + return TCL_OK; +} + +/* +** Usage: translate_selftest +** +** Call sqlite3utfSelfTest() to run the internal tests for unicode +** translation. If there is a problem an assert() will fail. +**/ +void sqlite3utfSelfTest(); +static int test_translate_selftest( + void * clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ + sqlite3utfSelfTest(); + return SQLITE_OK; +} + /* ** Register commands with the TCL interpreter. @@ -290,13 +187,10 @@ int Sqlitetest5_Init(Tcl_Interp *interp){ char *zName; Tcl_ObjCmdProc *xProc; } aCmd[] = { - { "sqlite_utf16to8", (Tcl_ObjCmdProc*)sqlite_utf16to8 }, - { "sqlite_utf8to16le", (Tcl_ObjCmdProc*)sqlite_utf8to16le }, - { "sqlite_utf8to16be", (Tcl_ObjCmdProc*)sqlite_utf8to16be }, - { "sqlite_utf16to16le", (Tcl_ObjCmdProc*)sqlite_utf16to16le }, - { "sqlite_utf16to16be", (Tcl_ObjCmdProc*)sqlite_utf16to16be }, { "binarize", (Tcl_ObjCmdProc*)binarize }, { "test_value_overhead", (Tcl_ObjCmdProc*)test_value_overhead }, + { "test_translate", (Tcl_ObjCmdProc*)test_translate }, + { "translate_selftest", (Tcl_ObjCmdProc*)test_translate_selftest}, }; int i; for(i=0; i<sizeof(aCmd)/sizeof(aCmd[0]); i++){ diff --git a/src/tokenize.c b/src/tokenize.c index aacf745d4..7f7e981fd 100644 --- a/src/tokenize.c +++ b/src/tokenize.c @@ -15,7 +15,7 @@ ** individual tokens and sends those tokens one-by-one over to the ** parser for analysis. ** -** $Id: tokenize.c,v 1.76 2004/05/31 23:56:43 danielk1977 Exp $ +** $Id: tokenize.c,v 1.77 2004/06/18 04:24:55 danielk1977 Exp $ */ #include "sqliteInt.h" #include "os.h" @@ -701,10 +701,18 @@ int sqlite3_complete(const char *zSql){ ** UTF-8. */ int sqlite3_complete16(const void *zSql){ - int rc; - char *zSql8 = sqlite3utf16to8(zSql, -1, SQLITE_BIGENDIAN); - if( !zSql8 ) return 0; - rc = sqlite3_complete(zSql8); - sqliteFree(zSql8); + sqlite3_value *pVal; + char *zSql8; + int rc = 0; + + pVal = sqlite3ValueNew(); + sqlite3ValueSetStr(pVal, -1, zSql, SQLITE_UTF16NATIVE, SQLITE_STATIC); + zSql8 = sqlite3ValueText(pVal, SQLITE_UTF8); + if( zSql8 ){ + rc = sqlite3_complete(zSql8); + sqliteFree(zSql8); + } + sqlite3ValueFree(pVal); return rc; } + @@ -12,7 +12,7 @@ ** This file contains routines used to translate between UTF-8, ** UTF-16, UTF-16BE, and UTF-16LE. ** -** $Id: utf.c,v 1.20 2004/06/17 05:36:44 danielk1977 Exp $ +** $Id: utf.c,v 1.21 2004/06/18 04:24:55 danielk1977 Exp $ ** ** Notes on UTF-8: ** @@ -48,31 +48,19 @@ ** When converting malformed UTF-16 strings to UTF-8, one instance of the ** replacement character U+FFFD for each pair of bytes that cannot be ** interpeted as part of a valid unicode character. +** +** This file contains the following public routines: +** +** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string. +** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings. +** sqlite3utf16ByteLen() - Calculate byte-length of a void* UTF16 string. +** sqlite3utf8CharLen() - Calculate char-length of a char* UTF8 string. +** sqlite3utf8LikeCompare() - Do a LIKE match given two UTF8 char* strings. +** */ #include <assert.h> #include "sqliteInt.h" - -typedef struct UtfString UtfString; -struct UtfString { - unsigned char *pZ; /* Raw string data */ - int n; /* Allocated length of pZ in bytes */ - int c; /* Number of pZ bytes already read or written */ -}; - -/* -** These two macros are used to interpret the first two bytes of the -** unsigned char array pZ as a 16-bit unsigned int. BE16() for a big-endian -** interpretation, LE16() for little-endian. -*/ -#define BE16(pZ) (((u16)((pZ)[0])<<8) + (u16)((pZ)[1])) -#define LE16(pZ) (((u16)((pZ)[1])<<8) + (u16)((pZ)[0])) - -/* -** READ_16 interprets the first two bytes of the unsigned char array pZ -** as a 16-bit unsigned int. If big_endian is non-zero the intepretation -** is big-endian, otherwise little-endian. -*/ -#define READ_16(pZ,big_endian) (big_endian?BE16(pZ):LE16(pZ)) +#include "vdbeInt.h" /* ** The following macro, LOWERCASE(x), takes an integer representing a @@ -96,353 +84,317 @@ static unsigned char UpperToLower[91] = { }; /* -** The first parameter, zStr, points at a unicode string. This routine -** reads a single character from the string and returns the codepoint value -** of the character read. -** -** The value of *pEnc is the string encoding. If *pEnc is SQLITE_UTF16LE or -** SQLITE_UTF16BE, and the first character read is a byte-order-mark, then -** the value of *pEnc is modified if necessary. In this case the next -** character is read and it's code-point value returned. -** -** The value of *pOffset is the byte-offset in zStr from which to begin -** reading. It is incremented by the number of bytes read by this function. -** -** If the fourth parameter, fold, is non-zero, then codepoint values are -** folded to lower-case before being returned. See comments for macro -** LOWERCASE(x) for details. +** This table maps from the first byte of a UTF-8 character to the number +** of trailing bytes expected. A value '255' indicates that the table key +** is not a legal first byte for a UTF-8 character. */ -int sqlite3ReadUniChar(const char *zStr, int *pOffset, u8 *pEnc, int fold){ - int ret = 0; - - switch( *pEnc ){ - case SQLITE_UTF8: { - -#if 0 - static const int initVal[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, - 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, - 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, - 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, - 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, - 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, - 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, - 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 0, 1, 2, - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, - 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 254, - 255, - }; - ret = initVal[(unsigned char)zStr[(*pOffset)++]]; - while( (0xc0&zStr[*pOffset])==0x80 ){ - ret = (ret<<6) | (0x3f&(zStr[(*pOffset)++])); - } -#endif - - struct Utf8TblRow { - u8 b1_mask; - u8 b1_masked_val; - u8 b1_value_mask; - int trailing_bytes; - }; - static const struct Utf8TblRow utf8tbl[] = { - { 0x80, 0x00, 0x7F, 0 }, - { 0xE0, 0xC0, 0x1F, 1 }, - { 0xF0, 0xE0, 0x0F, 2 }, - { 0xF8, 0xF0, 0x0E, 3 }, - { 0, 0, 0, 0} - }; - - u8 b1; /* First byte of the potentially multi-byte utf-8 character */ - int ii; - struct Utf8TblRow const *pRow; - - pRow = &(utf8tbl[0]); - - b1 = zStr[(*pOffset)++]; - while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){ - pRow++; - } - if( !pRow->b1_mask ){ - return (int)0xFFFD; - } - - ret = (u32)(b1&pRow->b1_value_mask); - for( ii=0; ii<pRow->trailing_bytes; ii++ ){ - u8 b = zStr[(*pOffset)++]; - if( (b&0xC0)!=0x80 ){ - return (int)0xFFFD; - } - ret = (ret<<6) + (u32)(b&0x3F); - } - break; - } - - case SQLITE_UTF16LE: - case SQLITE_UTF16BE: { - u32 code_point; /* the first code-point in the character */ - u32 code_point2; /* the second code-point in the character, if any */ - - code_point = READ_16(&zStr[*pOffset], (*pEnc==SQLITE_UTF16BE)); - *pOffset += 2; - - /* If this is a non-surrogate code-point, just cast it to an int and - ** this is the code-point value. - */ - if( code_point<0xD800 || code_point>0xE000 ){ - ret = code_point; - break; - } - - /* If this is a trailing surrogate code-point, then the string is - ** malformed; return the replacement character. - */ - if( code_point>0xDBFF ){ - return (int)0xFFFD; - } - - /* The code-point just read is a leading surrogate code-point. If their - ** is not enough data left or the next code-point is not a trailing - ** surrogate, return the replacement character. - */ - code_point2 = READ_16(&zStr[*pOffset], (*pEnc==SQLITE_UTF16BE)); - *pOffset += 2; - if( code_point2<0xDC00 || code_point>0xDFFF ){ - return (int)0xFFFD; - } - - ret = ( - (((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */ - ((code_point&0x003F)<<10) + /* xxxxxx */ - (code_point2&0x03FF) /* yy yyyyyyyy */ - ); - } - default: - assert(0); - } - - if( fold ){ - return LOWERCASE(ret); - } - return ret; -} +static const u8 xtra_utf8_bytes[256] = { +/* 0xxxxxxx */ +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +/* 10wwwwww */ +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + +/* 110yyyyy */ +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +/* 1110zzzz */ +2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + +/* 11110yyy */ +3, 3, 3, 3, 3, 3, 3, 3, 255, 255, 255, 255, 255, 255, 255, 255, +}; /* -** Read the BOM from the start of *pStr, if one is present. Return zero -** for little-endian, non-zero for big-endian. If no BOM is present, return -** the value of the parameter "big_endian". -** -** Return values: -** 1 -> big-endian string -** 0 -> little-endian string +** This table maps from the number of trailing bytes in a UTF-8 character +** to an integer constant that is effectively calculated for each character +** read by a naive implementation of a UTF-8 character reader. The code +** in the READ_UTF8 macro explains things best. */ -static int readUtf16Bom(UtfString *pStr, int big_endian){ - /* The BOM must be the first thing read from the string */ - assert( pStr->c==0 ); - - /* If the string data consists of 1 byte or less, the BOM will make no - ** difference anyway. In this case just fall through to the default case - ** and return the native byte-order for this machine. - ** - ** Otherwise, check the first 2 bytes of the string to see if a BOM is - ** present. - */ - if( pStr->n>1 ){ - u8 bom = sqlite3UtfReadBom(pStr->pZ, 2); - if( bom ){ - pStr->c += 2; - return (bom==SQLITE_UTF16LE)?0:1; - } - } +static const int xtra_utf8_bits[4] = { +0, +12416, /* (0xC0 << 6) + (0x80) */ +925824, /* (0xE0 << 12) + (0x80 << 6) + (0x80) */ +63447168 /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ +}; - return big_endian; +#define READ_UTF8(zIn, c) { \ + int xtra; \ + c = *(zIn)++; \ + xtra = xtra_utf8_bytes[c]; \ + switch( xtra ){ \ + case 255: c = (int)0xFFFD; break; \ + case 3: c = (c<<6) + *(zIn)++; \ + case 2: c = (c<<6) + *(zIn)++; \ + case 1: c = (c<<6) + *(zIn)++; \ + c -= xtra_utf8_bits[xtra]; \ + } \ } -/* -** zData is a UTF-16 encoded string, nData bytes in length. This routine -** checks if there is a byte-order mark at the start of zData. If no -** byte order mark is found 0 is returned. Otherwise SQLITE_UTF16BE or -** SQLITE_UTF16LE is returned, depending on whether The BOM indicates that -** the text is big-endian or little-endian. -*/ -u8 sqlite3UtfReadBom(const void *zData, int nData){ - if( nData<0 || nData>1 ){ - u8 b1 = *(u8 *)zData; - u8 b2 = *(((u8 *)zData) + 1); - if( b1==0xFE && b2==0xFF ){ - return SQLITE_UTF16BE; - } - if( b1==0xFF && b2==0xFE ){ - return SQLITE_UTF16LE; - } - } - return 0; +#define SKIP_UTF8(zIn) { \ + zIn += (xtra_utf8_bytes[*(u8 *)zIn] + 1); \ } - -/* -** Read a single unicode character from the UTF-8 encoded string *pStr. The -** value returned is a unicode scalar value. In the case of malformed -** strings, the unicode replacement character U+FFFD may be returned. -*/ -static u32 readUtf8(UtfString *pStr){ - u8 enc = SQLITE_UTF8; - return sqlite3ReadUniChar(pStr->pZ, &pStr->c, &enc, 0); +#define WRITE_UTF8(zOut, c) { \ + if( c<0x00080 ){ \ + *zOut++ = (c&0xFF); \ + } \ + else if( c<0x00800 ){ \ + *zOut++ = 0xC0 + ((c>>6)&0x1F); \ + *zOut++ = 0x80 + (c & 0x3F); \ + } \ + else if( c<0x10000 ){ \ + *zOut++ = 0xE0 + ((c>>12)&0x0F); \ + *zOut++ = 0x80 + ((c>>6) & 0x3F); \ + *zOut++ = 0x80 + (c & 0x3F); \ + }else{ \ + *zOut++ = 0xF0 + ((c>>18) & 0x07); \ + *zOut++ = 0x80 + ((c>>12) & 0x3F); \ + *zOut++ = 0x80 + ((c>>6) & 0x3F); \ + *zOut++ = 0x80 + (c & 0x3F); \ + } \ } -/* -** Write the unicode character 'code' to the string pStr using UTF-8 -** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails. -*/ -static int writeUtf8(UtfString *pStr, u32 code){ - struct Utf8WriteTblRow { - u32 max_code; - int trailing_bytes; - u8 b1_and_mask; - u8 b1_or_mask; - }; - static const struct Utf8WriteTblRow utf8tbl[] = { - {0x0000007F, 0, 0x7F, 0x00}, - {0x000007FF, 1, 0xDF, 0xC0}, - {0x0000FFFF, 2, 0xEF, 0xE0}, - {0x0010FFFF, 3, 0xF7, 0xF0}, - {0x00000000, 0, 0x00, 0x00} - }; - const struct Utf8WriteTblRow *pRow = &utf8tbl[0]; - - while( code>pRow->max_code ){ - assert( pRow->max_code ); - pRow++; - } +#define WRITE_UTF16LE(zOut, c) { \ + if( c<=0xFFFF ){ \ + *zOut++ = (c&0x00FF); \ + *zOut++ = ((c>>8)&0x00FF); \ + }else{ \ + *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ + *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \ + *zOut++ = (c&0x00FF); \ + *zOut++ = (0x00DC + ((c>>8)&0x03)); \ + } \ +} - /* Ensure there is enough room left in the output buffer to write - ** this UTF-8 character. - */ - assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) ); +#define WRITE_UTF16BE(zOut, c) { \ + if( c<=0xFFFF ){ \ + *zOut++ = ((c>>8)&0x00FF); \ + *zOut++ = (c&0x00FF); \ + }else{ \ + *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \ + *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ + *zOut++ = (0x00DC + ((c>>8)&0x03)); \ + *zOut++ = (c&0x00FF); \ + } \ +} - /* Write the UTF-8 encoded character to pStr. All cases below are - ** intentionally fall-through. - */ - switch( pRow->trailing_bytes ){ - case 3: - pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)|0x80; - code = code>>6; - case 2: - pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)|0x80; - code = code>>6; - case 1: - pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)|0x80; - code = code>>6; - case 0: - pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))|(pRow->b1_or_mask); - } - pStr->c += (pRow->trailing_bytes + 1); +#define READ_UTF16LE(zIn, c){ \ + c = (*zIn++); \ + c += ((*zIn++)<<8); \ + if( c>=0xD800 && c<=0xE000 ){ \ + int c2 = (*zIn++); \ + c2 += ((*zIn++)<<8); \ + c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ + } \ +} - return 0; +#define READ_UTF16BE(zIn, c){ \ + c = ((*zIn++)<<8); \ + c += (*zIn++); \ + if( c>=0xD800 && c<=0xE000 ){ \ + int c2 = ((*zIn++)<<8); \ + c2 += (*zIn++); \ + c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ + } \ } /* -** Read a single unicode character from the UTF-16 encoded string *pStr. The -** value returned is a unicode scalar value. In the case of malformed -** strings, the unicode replacement character U+FFFD may be returned. -** -** If big_endian is true, the string is assumed to be UTF-16BE encoded. -** Otherwise, it is UTF-16LE encoded. -*/ -static u32 readUtf16(UtfString *pStr, int big_endian){ - u32 code_point; /* the first code-point in the character */ +** If the TRANSLATE_TRACE macro is defined, the value of each Mem is +** printed on stderr on the way into and out of sqlite3VdbeMemTranslate(). +*/ +/* #define TRANSLATE_TRACE 1 */ - /* If there is only one byte of data left in the string, return the - ** replacement character. - */ - if( (pStr->n-pStr->c)==1 ){ - pStr->c++; - return (int)0xFFFD; +/* +** This routine transforms the internal text encoding used by pMem to +** desiredEnc. It is an error if the string is already of the desired +** encoding, or if *pMem does not contain a string value. +*/ +int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ + unsigned char zShort[NBFS]; /* Temporary short output buffer */ + int len; /* Maximum length of output string in bytes */ + unsigned char *zOut; /* Output buffer */ + unsigned char *zIn; /* Input iterator */ + unsigned char *zTerm; /* End of input */ + unsigned char *z; /* Output iterator */ + int c; + + assert( pMem->flags&MEM_Str ); + assert( pMem->enc!=desiredEnc ); + assert( pMem->enc!=0 ); + assert( pMem->n>=0 ); + +#ifdef TRANSLATE_TRACE + { + char zBuf[100]; + sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100); + fprintf(stderr, "INPUT: %s\n", zBuf); } +#endif - code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian); - pStr->c += 2; - - /* If this is a non-surrogate code-point, just cast it to an int and - ** return the code-point value. + /* If the translation is between UTF-16 little and big endian, then + ** all that is required is to swap the byte order. This case is handled + ** differently from the others. */ - if( code_point<0xD800 || code_point>0xE000 ){ - return code_point; + if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ + u8 temp; + sqlite3VdbeMemMakeWriteable(pMem); + zIn = pMem->z; + zTerm = &zIn[pMem->n]; + while( zIn<zTerm ){ + temp = *zIn; + *zIn = *(zIn+1); + zIn++; + *zIn++ = temp; + } + pMem->enc = desiredEnc; + goto translate_out; } - /* If this is a trailing surrogate code-point, then the string is - ** malformed; return the replacement character. + /* Set zIn to point at the start of the input buffer and zTerm to point 1 + ** byte past the end. + ** + ** Variable zOut is set to point at the output buffer. This may be space + ** obtained from malloc(), or Mem.zShort, if it large enough and not in + ** use, or the zShort array on the stack (see above). */ - if( code_point>0xDBFF ){ - return 0xFFFD; + zIn = pMem->z; + zTerm = &zIn[pMem->n]; + len = pMem->n*2 + 2; + if( len>NBFS ){ + zOut = sqliteMallocRaw(len); + if( !zOut ) return SQLITE_NOMEM; + }else{ + if( pMem->z==pMem->zShort ){ + zOut = zShort; + }else{ + zOut = pMem->zShort; + } } - - /* The code-point just read is a leading surrogate code-point. If their - ** is not enough data left or the next code-point is not a trailing - ** surrogate, return the replacement character. - */ - if( (pStr->n-pStr->c)>1 ){ - u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian); - if( code_point2<0xDC00 || code_point>0xDFFF ){ - return 0xFFFD; + z = zOut; + + if( pMem->enc==SQLITE_UTF8 ){ + if( desiredEnc==SQLITE_UTF16LE ){ + /* UTF-8 -> UTF-16 Little-endian */ + while( zIn<zTerm ){ + READ_UTF8(zIn, c); + WRITE_UTF16LE(z, c); + } + WRITE_UTF16LE(z, 0); + pMem->n = (z-zOut)-2; + }else if( desiredEnc==SQLITE_UTF16BE ){ + /* UTF-8 -> UTF-16 Big-endian */ + while( zIn<zTerm ){ + READ_UTF8(zIn, c); + WRITE_UTF16BE(z, c); + } + WRITE_UTF16BE(z, 0); + pMem->n = (z-zOut)-2; } - pStr->c += 2; - - return ( - (((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */ - ((code_point&0x003F)<<10) + /* xxxxxx */ - (code_point2&0x03FF) /* yy yyyyyyyy */ - ); + }else{ + assert( desiredEnc==SQLITE_UTF8 ); + if( pMem->enc==SQLITE_UTF16LE ){ + /* UTF-16 Little-endian -> UTF-8 */ + while( zIn<zTerm ){ + READ_UTF16LE(zIn, c); + WRITE_UTF8(z, c); + } + WRITE_UTF8(z, 0); + pMem->n = (z-zOut)-1; + }else{ + /* UTF-16 Little-endian -> UTF-8 */ + while( zIn<zTerm ){ + READ_UTF16BE(zIn, c); + WRITE_UTF8(z, c); + } + WRITE_UTF8(z, 0); + pMem->n = (z-zOut)-1; + } + } + sqlite3VdbeMemRelease(pMem); + pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short); + pMem->enc = desiredEnc; + if( (char *)zOut==pMem->zShort ){ + pMem->flags |= (MEM_Term|MEM_Short); + }else if( zOut==zShort ){ + memcpy(pMem->zShort, zOut, len); + zOut = pMem->zShort; + pMem->flags |= (MEM_Term|MEM_Short); }else{ - return (int)0xFFFD; + pMem->flags |= (MEM_Term|MEM_Dyn); } - - /* not reached */ + pMem->z = zOut; + +translate_out: +#ifdef TRANSLATE_TRACE + { + char zBuf[100]; + sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100); + fprintf(stderr, "OUTPUT: %s\n", zBuf); + } +#endif + return SQLITE_OK; } -static int writeUtf16(UtfString *pStr, int code, int big_endian){ - int bytes; - unsigned char *hi_byte; - unsigned char *lo_byte; - - bytes = (code>0x0000FFFF?4:2); - - /* Ensure there is enough room left in the output buffer to write - ** this UTF-8 character. - */ - assert( (pStr->n-pStr->c)>=bytes ); - - /* Initialise hi_byte and lo_byte to point at the locations into which - ** the MSB and LSB of the (first) 16-bit unicode code-point written for - ** this character. - */ - hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]); - lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]); +/* +** This routine checks for a byte-order mark at the beginning of the +** UTF-16 string stored in *pMem. If one is present, it is removed and +** the encoding of the Mem adjusted. This routine does not do any +** byte-swapping, it just sets Mem.enc appropriately. +** +** The allocation (static, dynamic etc.) and encoding of the Mem may be +** changed by this function. +*/ +int sqlite3VdbeMemHandleBom(Mem *pMem){ + int rc = SQLITE_OK; + u8 bom = 0; - if( bytes==2 ){ - *hi_byte = (u8)((code&0x0000FF00)>>8); - *lo_byte = (u8)(code&0x000000FF); - }else{ - u32 wrd; - wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)|0x0000D800; - *hi_byte = (u8)((wrd&0x0000FF00)>>8); - *lo_byte = (u8)(wrd&0x000000FF); - - wrd = (code&0x000003FF)|0x0000DC00; - *(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8); - *(lo_byte+2) = (u8)(wrd&0x000000FF); + if( pMem->n<0 || pMem->n>1 ){ + u8 b1 = *(u8 *)pMem->z; + u8 b2 = *(((u8 *)pMem->z) + 1); + if( b1==0xFE && b2==0xFF ){ + bom = SQLITE_UTF16BE; + } + if( b1==0xFF && b2==0xFE ){ + bom = SQLITE_UTF16LE; + } } - - pStr->c += bytes; - return 0; + if( bom ){ + if( pMem->flags & MEM_Short ){ + memmove(pMem->zShort, &pMem->zShort[2], NBFS-2); + pMem->n -= 2; + pMem->enc = bom; + } + else if( pMem->flags & MEM_Dyn ){ + void (*xDel)(void*) = pMem->xDel; + char *z = pMem->z; + pMem->z = 0; + pMem->xDel = 0; + rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT); + if( xDel ){ + xDel(z); + }else{ + sqliteFree(z); + } + }else{ + rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom, + SQLITE_TRANSIENT); + } + } + return rc; } /* @@ -452,22 +404,20 @@ static int writeUtf16(UtfString *pStr, int code, int big_endian){ ** number of unicode characters in the first nByte of pZ (or up to ** the first 0x00, whichever comes first). */ -int sqlite3utf8CharLen(const char *pZ, int nByte){ - UtfString str; - int ret = 0; - u32 code = 1; - - str.pZ = (char *)pZ; - str.n = nByte; - str.c = 0; - - while( (nByte<0 || str.c<str.n) && code!=0 ){ - code = readUtf8(&str); - ret++; +int sqlite3utf8CharLen(const char *z, int nByte){ + int r = 0; + const char *zTerm; + if( nByte>0 ){ + zTerm = &z[nByte]; + }else{ + zTerm = (const char *)(-1); } - if( code==0 ) ret--; - - return ret; + assert( z<=zTerm ); + while( *z!=0 && z<zTerm ){ + SKIP_UTF8(z); + r++; + } + return r; } /* @@ -477,242 +427,25 @@ int sqlite3utf8CharLen(const char *pZ, int nByte){ ** then return the number of bytes in the first nChar unicode characters ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first). */ -int sqlite3utf16ByteLen(const void *pZ, int nChar){ - if( nChar<0 ){ - const unsigned char *pC1 = (unsigned char *)pZ; - const unsigned char *pC2 = (unsigned char *)pZ+1; - while( *pC1 || *pC2 ){ - pC1 += 2; - pC2 += 2; +int sqlite3utf16ByteLen(const void *zIn, int nChar){ + int c = 1; + char const *z = zIn; + int n = 0; + if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ + while( c && ((nChar<0) || n<nChar) ){ + READ_UTF16BE(z, c); + n++; } - return pC1-(unsigned char *)pZ; }else{ - UtfString str; - u32 code = 1; - int big_endian; - int nRead = 0; - int ret; - - str.pZ = (char *)pZ; - str.c = 0; - str.n = -1; - - /* Check for a BOM. We just ignore it if there is one, it's only read - ** so that it is not counted as a character. - */ - big_endian = readUtf16Bom(&str, 0); - ret = 0-str.c; - - while( code!=0 && nRead<nChar ){ - code = readUtf16(&str, big_endian); - nRead++; + while( c && ((nChar<0) || n<nChar) ){ + READ_UTF16LE(z, c); + n++; } - if( code==0 ){ - ret -= 2; - } - return str.c + ret; } + return (z-(char const *)zIn)-((c==0)?2:0); } /* -** Convert a string in UTF-16 native byte (or with a Byte-order-mark or -** "BOM") into a UTF-8 string. The UTF-8 string is written into space -** obtained from sqlite3Malloc() and must be released by the calling function. -** -** The parameter N is the number of bytes in the UTF-16 string. If N is -** negative, the entire string up to the first \u0000 character is translated. -** -** The returned UTF-8 string is always \000 terminated. -*/ -unsigned char *sqlite3utf16to8(const void *pData, int N, int big_endian){ - UtfString in; - UtfString out; - - out.pZ = 0; - - in.pZ = (unsigned char *)pData; - in.n = N; - in.c = 0; - - if( in.n<0 ){ - in.n = sqlite3utf16ByteLen(in.pZ, -1); - } - - /* A UTF-8 encoding of a unicode string can require at most 1.5 times as - ** much space to store as the same string encoded using UTF-16. Allocate - ** this now. - */ - out.n = (in.n*1.5) + 1; - out.pZ = sqliteMalloc(out.n); - if( !out.pZ ){ - return 0; - } - out.c = 0; - - big_endian = readUtf16Bom(&in, big_endian); - while( in.c<in.n ){ - writeUtf8(&out, readUtf16(&in, big_endian)); - } - - /* Add the NULL-terminator character */ - assert( out.c<out.n ); - out.pZ[out.c] = 0x00; - - return out.pZ; -} - -static void *utf8toUtf16(const unsigned char *pIn, int N, int big_endian){ - UtfString in; - UtfString out; - - in.pZ = (unsigned char *)pIn; - in.n = N; - in.c = 0; - - if( in.n<0 ){ - in.n = strlen(in.pZ); - } - - /* A UTF-16 encoding of a unicode string can require at most twice as - ** much space to store as the same string encoded using UTF-8. Allocate - ** this now. - */ - out.n = (in.n*2) + 2; - out.pZ = sqliteMalloc(out.n); - if( !out.pZ ){ - return 0; - } - out.c = 0; - - while( in.c<in.n ){ - writeUtf16(&out, readUtf8(&in), big_endian); - } - - /* Add the NULL-terminator character */ - assert( (out.c+1)<out.n ); - out.pZ[out.c] = 0x00; - out.pZ[out.c+1] = 0x00; - - return out.pZ; -} - -/* -** Translate UTF-8 to UTF-16BE or UTF-16LE -*/ -void *sqlite3utf8to16be(const unsigned char *pIn, int N){ - return utf8toUtf16(pIn, N, 1); -} - -void *sqlite3utf8to16le(const unsigned char *pIn, int N){ - return utf8toUtf16(pIn, N, 0); -} - -/* -** This routine does the work for sqlite3utf16to16le() and -** sqlite3utf16to16be(). If big_endian is 1 the input string is -** transformed in place to UTF-16BE encoding. If big_endian is 0 then -** the input is transformed to UTF-16LE. -** -** Unless the first two bytes of the input string is a BOM, the input is -** assumed to be UTF-16 encoded using the machines native byte ordering. -*/ -static void utf16to16(void *pData, int N, int big_endian){ - UtfString inout; - inout.pZ = (unsigned char *)pData; - inout.c = 0; - inout.n = N; - - if( inout.n<0 ){ - inout.n = sqlite3utf16ByteLen(inout.pZ, -1); - } - - if( readUtf16Bom(&inout, SQLITE_BIGENDIAN)!=big_endian ){ - /* swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c); */ - int i; - for(i=0; i<(inout.n-inout.c); i += 2){ - char c1 = inout.pZ[i+inout.c]; - char c2 = inout.pZ[i+inout.c+1]; - inout.pZ[i] = c2; - inout.pZ[i+1] = c1; - } - }else if( inout.c ){ - memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c); - } - - inout.pZ[inout.n-inout.c] = 0x00; - inout.pZ[inout.n-inout.c+1] = 0x00; -} - -/* -** Convert a string in UTF-16 native byte or with a BOM into a UTF-16LE -** string. The conversion occurs in-place. The output overwrites the -** input. N bytes are converted. If N is negative everything is converted -** up to the first \u0000 character. -** -** If the native byte order is little-endian and there is no BOM, then -** this routine is a no-op. If there is a BOM at the start of the string, -** it is removed. -** -** Translation from UTF-16LE to UTF-16BE and back again is accomplished -** using the library function swab(). -*/ -void sqlite3utf16to16le(void *pData, int N){ - utf16to16(pData, N, 0); -} - -/* -** Convert a string in UTF-16 native byte or with a BOM into a UTF-16BE -** string. The conversion occurs in-place. The output overwrites the -** input. N bytes are converted. If N is negative everything is converted -** up to the first \u0000 character. -** -** If the native byte order is little-endian and there is no BOM, then -** this routine is a no-op. If there is a BOM at the start of the string, -** it is removed. -** -** Translation from UTF-16LE to UTF-16BE and back again is accomplished -** using the library function swab(). -*/ -void sqlite3utf16to16be(void *pData, int N){ - utf16to16(pData, N, 1); -} - -/* -** This function is used to translate between UTF-8 and UTF-16. The -** result is returned in dynamically allocated memory. -*/ -int sqlite3utfTranslate( - const void *zData, int nData, /* Input string */ - u8 enc1, /* Encoding of zData */ - void **zOut, int *nOut, /* Output string */ - u8 enc2 /* Desired encoding of output */ -){ - assert( enc1==SQLITE_UTF8 || enc1==SQLITE_UTF16LE || enc1==SQLITE_UTF16BE ); - assert( enc2==SQLITE_UTF8 || enc2==SQLITE_UTF16LE || enc2==SQLITE_UTF16BE ); - assert( - (enc1==SQLITE_UTF8 && (enc2==SQLITE_UTF16LE || enc2==SQLITE_UTF16BE)) || - (enc2==SQLITE_UTF8 && (enc1==SQLITE_UTF16LE || enc1==SQLITE_UTF16BE)) - ); - - if( enc1==SQLITE_UTF8 ){ - if( enc2==SQLITE_UTF16LE ){ - *zOut = sqlite3utf8to16le(zData, nData); - }else{ - *zOut = sqlite3utf8to16be(zData, nData); - } - if( !(*zOut) ) return SQLITE_NOMEM; - *nOut = sqlite3utf16ByteLen(*zOut, -1); - }else{ - *zOut = sqlite3utf16to8(zData, nData, enc1==SQLITE_UTF16BE); - if( !(*zOut) ) return SQLITE_NOMEM; - *nOut = strlen(*zOut); - } - return SQLITE_OK; -} - -#define sqliteNextChar(X) while( (0xc0&*++(X))==0x80 ){} - -/* ** Compare two UTF-8 strings for equality using the "LIKE" operator of ** SQL. The '%' character matches any sequence of 0 or more ** characters and '_' matches any single character. Case is @@ -731,7 +464,7 @@ int sqlite3utf8LikeCompare( while( (c=zPattern[1]) == '%' || c == '_' ){ if( c=='_' ){ if( *zString==0 ) return 0; - sqliteNextChar(zString); + SKIP_UTF8(zString); } zPattern++; } @@ -744,13 +477,13 @@ int sqlite3utf8LikeCompare( } if( c2==0 ) return 0; if( sqlite3utf8LikeCompare(&zPattern[1],zString) ) return 1; - sqliteNextChar(zString); + SKIP_UTF8(zString); } return 0; } case '_': { if( *zString==0 ) return 0; - sqliteNextChar(zString); + SKIP_UTF8(zString); zPattern++; break; } @@ -764,3 +497,50 @@ int sqlite3utf8LikeCompare( } return *zString==0; } + +#ifndef NDEBUG +/* +** This routine is called from the TCL test function "translate_selftest". +** It checks that the primitives for serializing and deserializing +** characters in each encoding are inverses of each other. +*/ +void sqlite3utfSelfTest(){ + int i; + unsigned char zBuf[20]; + unsigned char *z; + int n; + int c; + + for(i=0; 0 && i<0x00110000; i++){ + z = zBuf; + WRITE_UTF8(z, i); + n = z-zBuf; + z = zBuf; + READ_UTF8(z, c); + assert( c==i ); + assert( (z-zBuf)==n ); + } + for(i=0; i<0x00110000; i++){ + if( i>=0xD800 && i<=0xE000 ) continue; + z = zBuf; + WRITE_UTF16LE(z, i); + n = z-zBuf; + z = zBuf; + READ_UTF16LE(z, c); + assert( c==i ); + assert( (z-zBuf)==n ); + } + for(i=0; i<0x00110000; i++){ + if( i>=0xD800 && i<=0xE000 ) continue; + z = zBuf; + WRITE_UTF16BE(z, i); + n = z-zBuf; + z = zBuf; + READ_UTF16BE(z, c); + assert( c==i ); + assert( (z-zBuf)==n ); + } +} +#endif + + diff --git a/src/util.c b/src/util.c index 002bc2d35..29cd56112 100644 --- a/src/util.c +++ b/src/util.c @@ -14,7 +14,7 @@ ** This file contains functions for allocating memory, comparing ** strings, and stuff like that. ** -** $Id: util.c,v 1.102 2004/06/16 07:45:29 danielk1977 Exp $ +** $Id: util.c,v 1.103 2004/06/18 04:24:55 danielk1977 Exp $ */ #include "sqliteInt.h" #include <stdarg.h> @@ -256,6 +256,13 @@ char *sqlite3StrNDup_(const char *z, int n, char *zFile, int line){ } return zNew; } + +/* +** A version of sqliteFree that is always a function, not a macro. +*/ +void sqlite3FreeX(void *p){ + sqliteFree(p); +} #endif /* SQLITE_DEBUG */ /* @@ -446,23 +453,18 @@ void sqlite3SetNString(char **pz, ...){ ** to NULL. */ void sqlite3Error(sqlite *db, int err_code, const char *zFormat, ...){ - /* Free any existing error message. */ - if( db->zErrMsg ){ - sqliteFree(db->zErrMsg); - db->zErrMsg = 0; - } - if( db->zErrMsg16 ){ - sqliteFree(db->zErrMsg16); - db->zErrMsg16 = 0; - } - - /* Set the new error code and error message. */ - db->errCode = err_code; - if( zFormat ){ - va_list ap; - va_start(ap, zFormat); - db->zErrMsg = sqlite3VMPrintf(zFormat, ap); - va_end(ap); + if( db && (db->pErr || (db->pErr = sqlite3ValueNew())) ){ + db->errCode = err_code; + if( zFormat ){ + char *z; + va_list ap; + va_start(ap, zFormat); + z = sqlite3VMPrintf(zFormat, ap); + va_end(ap); + sqlite3ValueSetStr(db->pErr, -1, z, SQLITE_UTF8, sqlite3FreeX); + }else{ + sqlite3ValueSetStr(db->pErr, 0, 0, SQLITE_UTF8, SQLITE_STATIC); + } } } diff --git a/src/vdbe.c b/src/vdbe.c index 979889d65..ae7a38c9a 100644 --- a/src/vdbe.c +++ b/src/vdbe.c @@ -43,7 +43,7 @@ ** in this file for details. If in doubt, do not deviate from existing ** commenting and indentation practices when changing or adding code. ** -** $Id: vdbe.c,v 1.378 2004/06/17 07:53:03 danielk1977 Exp $ +** $Id: vdbe.c,v 1.379 2004/06/18 04:24:55 danielk1977 Exp $ */ #include "sqliteInt.h" #include "os.h" @@ -361,10 +361,12 @@ static void applyAffinity(Mem *pRec, char affinity, u8 enc){ ** Write a nice string representation of the contents of cell pMem ** into buffer zBuf, length nBuf. */ -void prettyPrintMem(Mem *pMem, char *zBuf, int nBuf){ +void sqlite3VdbeMemPrettyPrint(Mem *pMem, char *zBuf, int nBuf){ char *zCsr = zBuf; int f = pMem->flags; + static const char *encnames[] = {"(X)", "(8)", "(16LE)", "(16BE)"}; + if( f&MEM_Blob ){ int i; char c; @@ -414,11 +416,6 @@ void prettyPrintMem(Mem *pMem, char *zBuf, int nBuf){ zBuf[k++] = '['; for(j=0; j<15 && j<pMem->n; j++){ u8 c = pMem->z[j]; -/* - if( c==0 && j==pMem->n-1 ) break; - zBuf[k++] = "0123456789ABCDEF"[c>>4]; - zBuf[k++] = "0123456789ABCDEF"[c&0xf]; -*/ if( c>=0x20 && c<0x7f ){ zBuf[k++] = c; }else{ @@ -426,14 +423,10 @@ void prettyPrintMem(Mem *pMem, char *zBuf, int nBuf){ } } zBuf[k++] = ']'; + k += sprintf(&zBuf[k], encnames[pMem->enc]); zBuf[k++] = 0; } } - -/* Temporary - this is useful in conjunction with prettyPrintMem whilst -** debugging. -*/ -char zGdbBuf[100]; #endif @@ -734,20 +727,20 @@ case OP_String8: { pOp->opcode = OP_String; if( db->enc!=SQLITE_UTF8 && pOp->p3 ){ - char *z = pOp->p3; - if( db->enc==SQLITE_UTF16LE ){ - pOp->p3 = sqlite3utf8to16le(z, -1); - }else{ - pOp->p3 = sqlite3utf8to16be(z, -1); - } + pTos++; + sqlite3VdbeMemSetStr(pTos, pOp->p3, -1, SQLITE_UTF8, SQLITE_STATIC); + if( SQLITE_OK!=sqlite3VdbeChangeEncoding(pTos, db->enc) ) goto no_mem; + if( SQLITE_OK!=sqlite3VdbeMemDynamicify(pTos) ) goto no_mem; + pTos->flags &= ~(MEM_Dyn); + pTos->flags |= MEM_Static; if( pOp->p3type==P3_DYNAMIC ){ - sqliteFree(z); + sqliteFree(pOp->p3); } pOp->p3type = P3_DYNAMIC; - if( !pOp->p3 ) goto no_mem; + pOp->p3 = pTos->z; + break; } - - /* Fall through to the next case, OP_String */ + /* Otherwise fall through to the next case, OP_String */ } /* Opcode: String * * P3 @@ -4590,7 +4583,7 @@ default: { fprintf(p->trace, " r:%g", pTos[i].r); }else{ char zBuf[100]; - prettyPrintMem(&pTos[i], zBuf, 100); + sqlite3VdbeMemPrettyPrint(&pTos[i], zBuf, 100); fprintf(p->trace, " "); fprintf(p->trace, zBuf); } diff --git a/src/vdbeInt.h b/src/vdbeInt.h index ce5244fda..eed4f5a39 100644 --- a/src/vdbeInt.h +++ b/src/vdbeInt.h @@ -390,3 +390,5 @@ void sqlite3VdbeMemRelease(Mem *p); #ifndef NDEBUG void sqlite3VdbeMemSanity(Mem*, u8); #endif +int sqlite3VdbeMemTranslate(Mem*, u8); +void sqlite3VdbeMemPrettyPrint(Mem *pMem, char *zBuf, int nBuf); diff --git a/src/vdbeapi.c b/src/vdbeapi.c index dff0a10ad..fde3b1cd2 100644 --- a/src/vdbeapi.c +++ b/src/vdbeapi.c @@ -518,20 +518,7 @@ int sqlite3_bind_text16( } pVar = &p->apVar[i-1]; - /* There may or may not be a byte order mark at the start of the UTF-16. - ** Either way set 'txt_enc' to the SQLITE_UTF16* value indicating the - ** actual byte order used by this string. If the string does happen - ** to contain a BOM, then move zData so that it points to the first - ** byte after the BOM. - */ - txt_enc = sqlite3UtfReadBom(zData, nData); - if( txt_enc ){ - zData = (void *)(((u8 *)zData) + 2); - nData -= 2; - }else{ - txt_enc = SQLITE_BIGENDIAN?SQLITE_UTF16BE:SQLITE_UTF16LE; - } - rc = sqlite3VdbeMemSetStr(pVar, zData, nData, txt_enc, xDel); + rc = sqlite3VdbeMemSetStr(pVar, zData, nData, SQLITE_UTF16NATIVE, xDel); if( rc ){ return rc; } diff --git a/src/vdbemem.c b/src/vdbemem.c index 6becf6f87..8c5891dec 100644 --- a/src/vdbemem.c +++ b/src/vdbemem.c @@ -21,63 +21,23 @@ #include "vdbeInt.h" /* -** If pMem is a string object, this routine sets the encoding of the string -** (to one of UTF-8 or UTF16) and whether or not the string is -** nul-terminated. If pMem is not a string object, then this routine is -** a no-op. +** If pMem is an object with a valid string representation, this routine +** ensures the internal encoding for the string representation is +** 'desiredEnc', one of SQLITE_UTF8, SQLITE_UTF16LE or SQLITE_UTF16BE. ** -** The second argument, "desiredEnc" is one of TEXT_Utf8, TEXT_Utf16le -** or TEXT_Utf16be. This routine changes the encoding of pMem to match -** desiredEnc. +** If pMem is not a string object, or the encoding of the string +** representation is already stored using the requested encoding, then this +** routine is a no-op. ** ** SQLITE_OK is returned if the conversion is successful (or not required). ** SQLITE_NOMEM may be returned if a malloc() fails during conversion ** between formats. */ int sqlite3VdbeChangeEncoding(Mem *pMem, int desiredEnc){ - /* If this is not a string, or if it is a string but the encoding is - ** already correct, do nothing. */ if( !(pMem->flags&MEM_Str) || pMem->enc==desiredEnc ){ return SQLITE_OK; } - - if( pMem->enc==SQLITE_UTF8 || desiredEnc==SQLITE_UTF8 ){ - /* If the current encoding does not match the desired encoding, then - ** we will need to do some translation between encodings. - */ - char *z; - int n; - int rc; - - rc = sqlite3utfTranslate(pMem->z, pMem->n, pMem->enc, (void **)&z, - &n, desiredEnc); - if( rc!=SQLITE_OK ){ - return rc; - } - sqlite3VdbeMemRelease(pMem); - - /* Result of sqlite3utfTranslate is currently always dynamically - ** allocated and nul terminated. This might be altered as a performance - ** enhancement later. - */ - pMem->z = z; - pMem->n = n; - pMem->flags &= ~(MEM_Ephem | MEM_Short | MEM_Static); - pMem->flags |= MEM_Str | MEM_Dyn | MEM_Term; - pMem->xDel = 0; - }else{ - /* Must be translating between UTF-16le and UTF-16be. */ - int i; - u8 *pFrom, *pTo; - sqlite3VdbeMemMakeWriteable(pMem); - for(i=0, pFrom=pMem->z, pTo=&pMem->z[1]; i<pMem->n; i+=2, pFrom+=2,pTo+=2){ - u8 temp = *pFrom; - *pFrom = *pTo; - *pTo = temp; - } - } - pMem->enc = desiredEnc; - return SQLITE_OK; + return sqlite3VdbeMemTranslate(pMem, desiredEnc); } /* @@ -405,16 +365,19 @@ int sqlite3VdbeMemSetStr( case SQLITE_UTF16LE: case SQLITE_UTF16BE: pMem->flags |= MEM_Str; - if( n<0 ){ - pMem->n = sqlite3utf16ByteLen(z,-1); + if( pMem->n<0 ){ + pMem->n = sqlite3utf16ByteLen(pMem->z,-1); pMem->flags |= MEM_Term; } + if( sqlite3VdbeMemHandleBom(pMem) ){ + return SQLITE_NOMEM; + } break; default: assert(0); } - if( xDel==SQLITE_TRANSIENT ){ + if( pMem->flags&MEM_Ephem ){ return sqlite3VdbeMemMakeWriteable(pMem); } return SQLITE_OK; @@ -498,11 +461,9 @@ int sqlite3MemCompare(const Mem *pMem1, const Mem *pMem2, const CollSeq *pColl){ assert( pMem1->enc==SQLITE_UTF8 || pMem1->enc==SQLITE_UTF16LE || pMem1->enc==SQLITE_UTF16BE ); - /* FIX ME: This may fail if the collation sequence is deleted after - ** this vdbe program is compiled. We cannot just use BINARY in this - ** case as this may lead to a segfault caused by traversing an index - ** table incorrectly. We need to return an error to the user in this - ** case. + /* This assert may fail if the collation sequence is deleted after this + ** vdbe program is compiled. The documentation defines this as an + ** undefined condition. A crash is usual result. */ assert( !pColl || pColl->xCmp ); @@ -645,22 +606,17 @@ void sqlite3VdbeMemSanity(Mem *pMem, u8 db_enc){ ** SQLITE_UTF8. */ const void *sqlite3ValueText(sqlite3_value* pVal, u8 enc){ + if( !pVal ) return 0; assert( enc==SQLITE_UTF16LE || enc==SQLITE_UTF16BE || enc==SQLITE_UTF8); + if( pVal->flags&MEM_Null ){ - /* For a NULL return a NULL Pointer */ return 0; } - if( pVal->flags&MEM_Str ){ - /* If there is already a string representation, make sure it is in - ** encoded in the required UTF-16 byte order. - */ sqlite3VdbeChangeEncoding(pVal, enc); }else if( !(pVal->flags&MEM_Blob) ){ - /* Otherwise, unless this is a blob, convert it to a UTF-16 string */ sqlite3VdbeMemStringify(pVal, enc); } - return (const void *)(pVal->z); } @@ -673,12 +629,19 @@ sqlite3_value* sqlite3ValueNew(){ return p; } -void sqlite3ValueSetStr(sqlite3_value *v, int n, const void *z, u8 enc){ - sqlite3VdbeMemSetStr((Mem *)v, z, n, enc, SQLITE_STATIC); +void sqlite3ValueSetStr( + sqlite3_value *v, + int n, + const void *z, + u8 enc, + void (*xDel)(void*) +){ + if( v ) sqlite3VdbeMemSetStr((Mem *)v, z, n, enc, xDel); } void sqlite3ValueFree(sqlite3_value *v){ - sqlite3ValueSetStr(v, 0, 0, SQLITE_UTF8); + if( !v ) return; + sqlite3ValueSetStr(v, 0, 0, SQLITE_UTF8, SQLITE_STATIC); sqliteFree(v); } |