From 998b56c351e56cb9455d667b740ec0c6ead4b48d Mon Sep 17 00:00:00 2001 From: danielk1977 Date: Thu, 6 May 2004 23:37:52 +0000 Subject: Add code to convert between the various supported unicode encoding schemes. Untested at this point. (CVS 1315) FossilOrigin-Name: 71260ff7f7030f56c292b43f83a213c65c9a184e --- src/sqliteInt.h | 9 +- src/tclsqlite.c | 4 +- src/test5.c | 196 +++++++++++++++++++++++++ src/utf.c | 440 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 635 insertions(+), 14 deletions(-) create mode 100644 src/test5.c (limited to 'src') diff --git a/src/sqliteInt.h b/src/sqliteInt.h index c3e3de6c0..abf3d8436 100644 --- a/src/sqliteInt.h +++ b/src/sqliteInt.h @@ -11,7 +11,7 @@ ************************************************************************* ** Internal interface definitions for SQLite. ** -** @(#) $Id: sqliteInt.h,v 1.221 2004/04/26 14:10:22 drh Exp $ +** @(#) $Id: sqliteInt.h,v 1.222 2004/05/06 23:37:53 danielk1977 Exp $ */ #include "config.h" #include "sqlite.h" @@ -1268,3 +1268,10 @@ int sqliteFixTriggerStep(DbFixer*, TriggerStep*); double sqliteAtoF(const char *z, const char **); char *sqlite_snprintf(int,char*,const char*,...); int sqliteFitsIn32Bits(const char *); + +unsigned char *sqlite3utf16to8(const void *pData, int N); +void *sqlite3utf8to16be(const unsigned char *pIn, int N); +void *sqlite3utf8to16le(const unsigned char *pIn, int N); +void sqlite3utf16to16le(void *pData, int N); +void sqlite3utf16to16be(void *pData, int N); + diff --git a/src/tclsqlite.c b/src/tclsqlite.c index e1ca48373..73c1ea4b3 100644 --- a/src/tclsqlite.c +++ b/src/tclsqlite.c @@ -11,7 +11,7 @@ ************************************************************************* ** A TCL Interface to SQLite ** -** $Id: tclsqlite.c,v 1.60 2004/04/26 14:10:22 drh Exp $ +** $Id: tclsqlite.c,v 1.61 2004/05/06 23:37:53 danielk1977 Exp $ */ #ifndef NO_TCL /* Omit this whole file if TCL is unavailable */ @@ -1208,11 +1208,13 @@ int TCLSH_MAIN(int argc, char 
**argv){ extern int Sqlitetest2_Init(Tcl_Interp*); extern int Sqlitetest3_Init(Tcl_Interp*); extern int Sqlitetest4_Init(Tcl_Interp*); + extern int Sqlitetest5_Init(Tcl_Interp*); extern int Md5_Init(Tcl_Interp*); /* Sqlitetest1_Init(interp); */ Sqlitetest2_Init(interp); /* Sqlitetest3_Init(interp); */ /* Sqlitetest4_Init(interp); */ + Sqlitetest5_Init(interp); Md5_Init(interp); } #endif diff --git a/src/test5.c b/src/test5.c new file mode 100644 index 000000000..aa8cc26f9 --- /dev/null +++ b/src/test5.c @@ -0,0 +1,196 @@ +/* +** 2001 September 15 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** Code for testing the utf.c module in SQLite. This code +** is not included in the SQLite library. It is used for automated +** testing of the SQLite library. +** +** $Id: +*/ +#include "sqliteInt.h" +#include "tcl.h" +#include +#include + +/* +** Return the number of bytes up to and including the first \u0000 +** character in *pStr. 
+*/
+static int utf16_length(const unsigned char *pZ){
+  const unsigned char *pC1 = pZ;
+  const unsigned char *pC2 = pZ+1;
+  while( *pC1 || *pC2 ){
+    pC1 += 2;
+    pC2 += 2;
+  }
+  return (pC1-pZ)+2;
+}
+
+/*
+** TCL command: pass the byte-array argument to sqlite3utf8to16le() and
+** return the converted string, including its two-byte \u0000 terminator,
+** as a byte array. Tcl_NewByteArrayObj() copies the bytes, so the
+** converted buffer is released before returning.
+*/
+static int sqlite_utf8to16le(
+  void * clientData,
+  Tcl_Interp *interp,
+  int objc,
+  Tcl_Obj *CONST objv[]
+){
+  unsigned char *out;
+  unsigned char *in;
+  Tcl_Obj *res;
+
+  if( objc!=2 ){
+    Tcl_AppendResult(interp, "wrong # args: should be \"",
+        Tcl_GetStringFromObj(objv[0], 0), "", 0);
+    return TCL_ERROR;
+  }
+
+  in = Tcl_GetByteArrayFromObj(objv[1], 0);
+  out = (unsigned char *)sqlite3utf8to16le(in, -1);
+  /* Measure the converted buffer; "ret" was never declared here. */
+  res = Tcl_NewByteArrayObj(out, utf16_length(out));
+  sqliteFree(out);
+
+  Tcl_SetObjResult(interp, res);
+
+  return TCL_OK;
+}
+
+/*
+** TCL command: same as sqlite_utf8to16le, but the conversion is done by
+** sqlite3utf8to16be().
+*/
+static int sqlite_utf8to16be(
+  void * clientData,
+  Tcl_Interp *interp,
+  int objc,
+  Tcl_Obj *CONST objv[]
+){
+  unsigned char *out;
+  unsigned char *in;
+  Tcl_Obj *res;
+
+  if( objc!=2 ){
+    Tcl_AppendResult(interp, "wrong # args: should be \"",
+        Tcl_GetStringFromObj(objv[0], 0), "", 0);
+    return TCL_ERROR;
+  }
+
+  in = Tcl_GetByteArrayFromObj(objv[1], 0);
+  out = (unsigned char *)sqlite3utf8to16be(in, -1);
+  res = Tcl_NewByteArrayObj(out, utf16_length(out));
+  sqliteFree(out);
+
+  Tcl_SetObjResult(interp, res);
+
+  return TCL_OK;
+}
+
+/*
+** TCL command: copy the byte-array argument into a scratch buffer, run
+** sqlite3utf16to16le() on the copy (the routine returns void, so it
+** presumably converts in place) and return the buffer contents up to and
+** including the \u0000 terminator as a byte array.
+*/
+static int sqlite_utf16to16le(
+  void * clientData,
+  Tcl_Interp *interp,
+  int objc,
+  Tcl_Obj *CONST objv[]
+){
+  unsigned char *out;
+  unsigned char *in;
+  int in_len;
+  Tcl_Obj *res;
+
+  if( objc!=2 ){
+    Tcl_AppendResult(interp, "wrong # args: should be \"",
+        Tcl_GetStringFromObj(objv[0], 0), "", 0);
+    return TCL_ERROR;
+  }
+
+  in = Tcl_GetByteArrayFromObj(objv[1], &in_len);
+  out = (unsigned char *)sqliteMalloc(in_len);
+  memcpy(out, in, in_len);
+
+  sqlite3utf16to16le(out, -1);
+  res = Tcl_NewByteArrayObj(out, utf16_length(out));
+  sqliteFree(out);
+
+  Tcl_SetObjResult(interp, res);
+
+  return TCL_OK;
+}
+
+/*
+** TCL command: same as sqlite_utf16to16le, but the conversion is done by
+** sqlite3utf16to16be().
+*/
+static int sqlite_utf16to16be(
+  void * clientData,
+  Tcl_Interp *interp,
+  int objc,
+  Tcl_Obj *CONST objv[]
+){
+  unsigned char *out;
+  unsigned char *in;
+  int in_len;
+  Tcl_Obj *res;
+
+  if( objc!=2 ){
+    Tcl_AppendResult(interp, "wrong # args: should be \"",
+        Tcl_GetStringFromObj(objv[0], 0), "", 0);
+    return TCL_ERROR;
+  }
+
+  in = Tcl_GetByteArrayFromObj(objv[1], &in_len);
+  out = (unsigned char *)sqliteMalloc(in_len);
+  memcpy(out, in, in_len);
+
+  sqlite3utf16to16be(out, -1);
+  res = Tcl_NewByteArrayObj(out, utf16_length(out));
+  sqliteFree(out);
+
+  Tcl_SetObjResult(interp, res);
+
+  return TCL_OK;
+}
+
+/*
+** TCL command: pass the byte-array argument to sqlite3utf16to8() and
+** return the converted string, without its terminator, as a byte array.
+*/
+static int sqlite_utf16to8(
+  void * clientData,
+  Tcl_Interp *interp,
+  int objc,
+  Tcl_Obj *CONST objv[]
+){
+  unsigned char *out;
+  unsigned char *in;
+  Tcl_Obj *res;
+
+  if( objc!=2 ){
+    Tcl_AppendResult(interp, "wrong # args: should be \"",
+        Tcl_GetStringFromObj(objv[0], 0), "", 0);
+    return TCL_ERROR;
+  }
+
+  in = Tcl_GetByteArrayFromObj(objv[1], 0);
+  out = sqlite3utf16to8(in, -1);
+  /* strlen() takes a char pointer; "ret" was never declared here. */
+  res = Tcl_NewByteArrayObj(out, strlen((const char *)out));
+  sqliteFree(out);
+
+  Tcl_SetObjResult(interp, res);
+
+  return TCL_OK;
+}
+
+
+/*
+** Register commands with the TCL interpreter.
+**
+** NOTE(review): the handlers above take (objc, objv), i.e. they have the
+** Tcl_ObjCmdProc shape, yet they are cast to Tcl_CmdProc* here. Confirm
+** that the registration call matches the handler signature.
+*/
+int Sqlitetest5_Init(Tcl_Interp *interp){
+  static struct {
+    char *zName;
+    Tcl_CmdProc *xProc;
+  } aCmd[] = {
+    { "sqlite_utf16to8", (Tcl_CmdProc*)sqlite_utf16to8 },
+    { "sqlite_utf8to16le", (Tcl_CmdProc*)sqlite_utf8to16le },
+    { "sqlite_utf8to16be", (Tcl_CmdProc*)sqlite_utf8to16be },
+    { "sqlite_utf16to16le", (Tcl_CmdProc*)sqlite_utf16to16le },
+    { "sqlite_utf16to16be", (Tcl_CmdProc*)sqlite_utf16to16be }
+  };
+  int i;
+  for(i=0; i
+#include
+#include "sqliteInt.h"
+
+typedef struct UtfString UtfString;
+struct UtfString {
+  unsigned char *pZ;    /* Raw string data */
+  int n;                /* Allocated length of pZ in bytes */
+  int c;                /* Number of pZ bytes already read or written */
+};
+
+/* TODO: Implement this macro in os.h. It should be 1 on big-endian
+** machines, and 0 on little-endian.
+*/
+#define SQLITE3_NATIVE_BIGENDIAN 0
+
+/*
+** Byte-order-mark values as seen through BE16(), i.e. with the first
+** byte of the string in the high-order position. BE16() assembles its
+** value byte by byte, so these constants do not depend on the byte order
+** of the host: the bytes FE FF introduce a big-endian string and the
+** bytes FF FE introduce a little-endian string.
+*/
+#define BOM_BIGENDIAN 0x0000FEFF
+#define BOM_LITTLEENDIAN 0x0000FFFE
+
+/*
+** These two macros are used to interpret the first two bytes of the
+** unsigned char array pZ as a 16-bit unsigned int. BE16() for a big-endian
+** interpretation, LE16() for little-endian.
+*/
+#define BE16(pZ) (((u16)((pZ)[0])<<8) + (u16)((pZ)[1]))
+#define LE16(pZ) (((u16)((pZ)[1])<<8) + (u16)((pZ)[0]))
+
+/*
+** READ_16 interprets the first two bytes of the unsigned char array pZ
+** as a 16-bit unsigned int. If big_endian is non-zero the interpretation
+** is big-endian, otherwise little-endian.
+*/
+#define READ_16(pZ,big_endian) ((big_endian)?BE16(pZ):LE16(pZ))
+
+/*
+** Read the BOM from the start of *pStr, if one is present. Return zero
+** for little-endian, non-zero for big-endian. If no BOM is present, return
+** the machine's native byte order (SQLITE3_NATIVE_BIGENDIAN).
+**
+** When a BOM is found, pStr->c is advanced past it so that the next read
+** starts at the first real character.
+**
+** Return values:
+**     1 -> big-endian string
+**     0 -> little-endian string
+*/
+static int readUtf16Bom(UtfString *pStr){
+  /* The BOM must be the first thing read from the string */
+  assert( pStr->c==0 );
+
+  /* If the string data consists of 1 byte or less, the BOM will make no
+  ** difference anyway. In this case just fall through to the default case
+  ** and return the native byte-order for this machine.
+  **
+  ** Otherwise, check the first 2 bytes of the string to see if a BOM is
+  ** present.
+  */
+  if( pStr->n>1 ){
+    u32 bom = BE16(pStr->pZ);
+    if( bom==BOM_BIGENDIAN ){
+      pStr->c = 2;
+      return 1;
+    }
+    if( bom==BOM_LITTLEENDIAN ){
+      pStr->c = 2;
+      return 0;
+    }
+  }
+
+  return SQLITE3_NATIVE_BIGENDIAN;
+}
+
+
+/*
+** Read a single unicode character from the UTF-8 encoded string *pStr. The
+** value returned is a unicode scalar value. In the case of malformed
+** strings, the unicode replacement character U+FFFD may be returned.
+*/
+static u32 readUtf8(UtfString *pStr){
+  struct Utf8TblRow {
+    u8 b1_mask;
+    u8 b1_masked_val;
+    u8 b1_value_mask;
+    int trailing_bytes;
+  };
+  /* One row per lead-byte pattern. A lead byte b matches a row when
+  ** (b & b1_mask)==b1_masked_val; (b & b1_value_mask) is then the payload
+  ** carried by the lead byte, and trailing_bytes continuation bytes
+  ** follow. The all-zero row terminates the table.
+  */
+  static const struct Utf8TblRow utf8tbl[] = {
+    { 0x80, 0x00, 0x7F, 0 },
+    { 0xE0, 0xC0, 0x1F, 1 },
+    { 0xF0, 0xE0, 0x0F, 2 },
+    { 0xF8, 0xF0, 0x07, 3 },   /* 11110xxx carries three payload bits */
+    { 0, 0, 0, 0}
+  };
+
+  u8 b1;        /* First byte of the potentially multi-byte utf-8 character */
+  u32 ret = 0;  /* Return value */
+  int ii;
+  struct Utf8TblRow const *pRow;
+
+  pRow = &(utf8tbl[0]);
+
+  /* The lead byte is always consumed, even when it turns out to be bad */
+  b1 = pStr->pZ[pStr->c];
+  pStr->c++;
+  while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){
+    pRow++;
+  }
+  if( !pRow->b1_mask ){
+    return 0xFFFD;
+  }
+
+  /* Continuation bytes are consumed only if all of them are well formed */
+  ret = (u32)(b1&pRow->b1_value_mask);
+  for( ii=0; ii<pRow->trailing_bytes; ii++ ){
+    u8 b = pStr->pZ[pStr->c+ii];
+    if( (b&0xC0)!=0x80 ){
+      return 0xFFFD;
+    }
+    ret = (ret<<6) + (u32)(b&0x3F);
+  }
+
+  pStr->c += pRow->trailing_bytes;
+  return ret;
+}
+
+/*
+** Write the unicode character 'code' to the string pStr using UTF-8
+** encoding and advance pStr->c past the bytes written. The caller must
+** have left enough room in pStr; this routine does not allocate. Values
+** above U+10FFFF cannot be encoded and are written as U+FFFD.
+**
+** Always returns 0.
+*/
+static int writeUtf8(UtfString *pStr, u32 code){
+  struct Utf8WriteTblRow {
+    u32 max_code;
+    int trailing_bytes;
+    u8 b1_and_mask;
+    u8 b1_or_mask;
+  };
+  static const struct Utf8WriteTblRow utf8tbl[] = {
+    {0x0000007F, 0, 0x7F, 0x00},
+    {0x000007FF, 1, 0xDF, 0xC0},
+    {0x0000FFFF, 2, 0xEF, 0xE0},
+    {0x0010FFFF, 3, 0xF7, 0xF0},
+    {0x00000000, 0, 0x00, 0x00}
+  };
+  /* Must not be static: the search restarts at the first row each call */
+  const struct Utf8WriteTblRow *pRow = &utf8tbl[0];
+
+  if( code>0x0010FFFF ){
+    code = 0xFFFD;
+  }
+
+  /* Find the first row whose range is large enough to hold 'code' */
+  while( code>pRow->max_code ){
+    pRow++;
+  }
+  assert( pRow->max_code );
+
+  /* Ensure there is enough room left in the output buffer to write
+  ** this UTF-8 character.
+  */
+  assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) );
+
+  /* Write the UTF-8 encoded character to pStr, last byte first. All cases
+  ** below are intentionally fall-through.
+  */
+  switch( pRow->trailing_bytes ){
+    case 3:
+      pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)|0x80;
+      code = code>>6;
+      /* fall through */
+    case 2:
+      pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)|0x80;
+      code = code>>6;
+      /* fall through */
+    case 1:
+      pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)|0x80;
+      code = code>>6;
+      /* fall through */
+    case 0:
+      pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))|(pRow->b1_or_mask);
+  }
+  pStr->c += (pRow->trailing_bytes + 1);
+
+  return 0;
+}
+
+/*
+** Read a single unicode character from the UTF-16 encoded string *pStr. The
+** value returned is a unicode scalar value. In the case of malformed
+** strings, the unicode replacement character U+FFFD may be returned.
+**
+** If big_endian is true, the string is assumed to be UTF-16BE encoded.
+** Otherwise, it is UTF-16LE encoded.
+*/
+static u32 readUtf16(UtfString *pStr, int big_endian){
+  u32 code_point;   /* the first code-point in the character */
+  u32 code_point2;  /* the second code-point of a surrogate pair */
+
+  /* If there is only one byte of data left in the string, return the
+  ** replacement character.
+  */
+  if( (pStr->n-pStr->c)==1 ){
+    pStr->c++;
+    return 0xFFFD;
+  }
+
+  code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian);
+  pStr->c += 2;
+
+  /* If this is a non-surrogate code-point (anything outside the range
+  ** 0xD800..0xDFFF), just return the code-point value.
+  */
+  if( code_point<0xD800 || code_point>0xDFFF ){
+    return code_point;
+  }
+
+  /* If this is a trailing surrogate code-point, then the string is
+  ** malformed; return the replacement character.
+  */
+  if( code_point>0xDBFF ){
+    return 0xFFFD;
+  }
+
+  /* The code-point just read is a leading surrogate code-point. If there
+  ** is not enough data left or the next code-point is not a trailing
+  ** surrogate, return the replacement character.
+  */
+  if( (pStr->n-pStr->c)<2 ){
+    return 0xFFFD;
+  }
+  code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian);
+  if( code_point2<0xDC00 || code_point2>0xDFFF ){
+    return 0xFFFD;
+  }
+  pStr->c += 2;
+
+  /* Lead surrogate:  110110wwwwxxxxxx, where uuuuu = wwww + 1
+  ** Trail surrogate: 110111yyyyyyyyyy
+  ** Scalar value:    uuuuu xxxxxx yyyyyyyyyy
+  */
+  return (
+      (((code_point&0x03C0)+0x0040)<<10) +   /* uuuuu */
+      ((code_point&0x003F)<<10) +            /* xxxxxx */
+      (code_point2&0x03FF)                   /* yy yyyyyyyy */
+  );
+}
+
+/*
+** Write the unicode character 'code' to the string pStr using UTF-16
+** encoding, big-endian if big_endian is true and little-endian otherwise.
+** Values above 0xFFFF are written as a surrogate pair (4 bytes), all
+** other values as a single 16-bit unit (2 bytes). pStr->c is advanced
+** past the bytes written. Always returns 0.
+*/
+static int writeUtf16(UtfString *pStr, int code, int big_endian){
+  int bytes;
+  unsigned char *hi_byte;
+  unsigned char *lo_byte;
+
+  bytes = (code>0x0000FFFF?4:2);
+
+  /* Ensure there is enough room left in the output buffer to write
+  ** this UTF-16 character.
+  */
+  assert( (pStr->n-pStr->c)>=bytes );
+
+  /* Initialise hi_byte and lo_byte to point at the locations into which
+  ** the MSB and LSB of the (first) 16-bit unicode code-point written for
+  ** this character are to be stored.
+  */
+  hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]);
+  lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]);
+
+  if( bytes==2 ){
+    *hi_byte = (u8)((code&0x0000FF00)>>8);
+    *lo_byte = (u8)(code&0x000000FF);
+  }else{
+    u32 wrd;
+    /* Leading surrogate: top ten bits of (code - 0x10000) */
+    wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)|0x0000D800;
+    *hi_byte = (u8)((wrd&0x0000FF00)>>8);
+    *lo_byte = (u8)(wrd&0x000000FF);
+
+    /* Trailing surrogate: low ten bits of code */
+    wrd = (code&0x000003FF)|0x0000DC00;
+    *(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8);
+    *(lo_byte+2) = (u8)(wrd&0x000000FF);
+  }
+
+  pStr->c += bytes;
+
+  return 0;
+}
+
+/*
+** Return the number of bytes up to (but not including) the first \u0000
+** character in *pStr.
+*/
+static int utf16Bytelen(const unsigned char *pZ){
+  const unsigned char *pC1 = pZ;
+  const unsigned char *pC2 = pZ+1;
+  while( *pC1 || *pC2 ){
+    pC1 += 2;
+    pC2 += 2;
+  }
+  return pC1-pZ;
+}
 /*
 ** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
 ** "BOM") into a UTF-8 string. The UTF-8 string is written into space
-** obtained from sqlit3Malloc() and must be released by the calling function.
+** obtained from sqlite3Malloc() and must be released by the calling function. ** ** The parameter N is the number of bytes in the UTF-16 string. If N is ** negative, the entire string up to the first \u0000 character is translated. @@ -45,7 +348,113 @@ ** The returned UTF-8 string is always \000 terminated. */ unsigned char *sqlite3utf16to8(const void *pData, int N){ - unsigned char *in = (unsigned char *)pData; + UtfString in; + UtfString out; + int big_endian; + + out.pZ = 0; + + in.pZ = (unsigned char *)pData; + in.n = N; + in.c = 0; + + if( in.n<0 ){ + in.n = utf16Bytelen(in.pZ); + } + + /* A UTF-8 encoding of a unicode string can require at most 1.5 times as + ** much space to store as the same string encoded using UTF-16. Allocate + ** this now. + */ + out.n = (in.n*1.5) + 1; + out.pZ = sqliteMalloc(in.n); + if( !out.pZ ){ + return 0; + } + out.c = 0; + + big_endian = readUtf16Bom(&in); + while( in.c