diff --git a/main.mk b/main.mk index 120167bd0f..cd3a94d658 100644 --- a/main.mk +++ b/main.mk @@ -55,7 +55,7 @@ TCCX = $(TCC) $(OPTS) $(THREADSAFE) $(USLEEP) -I. -I$(TOP)/src # Object files for the SQLite library. # LIBOBJ = hash.o os.o pager.o random.o \ - util.o tclsqlite.o + util.o tclsqlite.o utf.o LIBOBJ_ORIG = attach.o auth.o btree.o btree_rb.o build.o copy.o date.o delete.o \ expr.o func.o hash.o insert.o \ @@ -121,6 +121,7 @@ TESTSRC = \ $(TOP)/src/os.c \ $(TOP)/src/pager.c \ $(TOP)/src/test2.c \ + $(TOP)/src/test5.c \ $(TOP)/src/md5.c TESTSRC_ORIG = \ @@ -230,6 +231,9 @@ os.o: $(TOP)/src/os.c $(HDR) parse.o: parse.c $(HDR) $(TCCX) -c parse.c +utf.o: $(TOP)/src/utf.c $(HDR) + $(TCCX) -c $(TOP)/src/utf.c + parse.h: parse.c parse.c: $(TOP)/src/parse.y lemon diff --git a/manifest b/manifest index 412cbd14bf..9f9bb35a70 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Update\stest3.c\sto\swork\swith\sthe\snew\sbtree.c\sAPI.\s(CVS\s1314) -D 2004-05-04T17:27:28 +C Add\scode\sto\sconvert\sbetween\sthe\svarious\ssupported\sunicode\sencoding\sschemes.\nUntested\sat\sthis\spoint.\s(CVS\s1315) +D 2004-05-06T23:37:52 F Makefile.in ab7b0d5118e2da97bac66be8684a1034e3500f5a F Makefile.linux-gcc b86a99c493a5bfb402d1d9178dcdc4bd4b32f906 F README f1de682fbbd94899d50aca13d387d1b3fd3be2dd @@ -15,7 +15,7 @@ F doc/lemon.html f0f682f50210928c07e562621c3b7e8ab912a538 F doc/report1.txt a031aaf37b185e4fa540223cb516d3bccec7eeac F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895 F ltmain.sh f6b283068efa69f06eb8aa1fe4bddfdbdeb35826 -F main.mk 1318f38db512abb0abdbdf4d3c9dd213e2960977 +F main.mk b0b22dffdd728c3a96c2b2e87e01fe86a017fa34 F publish.sh 1cd5c982388560fa91eedf6a338e210f713b35c8 F spec.template a38492f1c1dd349fc24cb0565e08afc53045304b F sqlite.1 83f4a9d37bdf2b7ef079a82d54eaf2e3509ee6ea @@ -49,17 +49,18 @@ F src/random.c 775913e0b7fbd6295d21f12a7bd35b46387c44b2 F src/select.c 3833e2b64cc6d249385ee44e13bf49c9ae5b903d F src/shell.c 
920af040d3a33ea8919c82cee45b424ad841cee0 F src/sqlite.h.in 35bec264dfb4965bbfeb7e75221f8658f210c30d -F src/sqliteInt.h 36c649d7f2ab0affdc44e51c681dd1d5723ee1e9 +F src/sqliteInt.h dca24187ffe06e9207f9f400a2a0632ea483e55b F src/table.c d845cb101b5afc1f7fea083c99e3d2fa7998d895 -F src/tclsqlite.c e816201db3ea6ba857a0351547be1d4b7286e95d +F src/tclsqlite.c 3a5f8192ed5279a68610102a92a3a9f0cdd09e68 F src/test1.c 9aa62b89d420e6763b5e7ae89a47f6cf87370477 F src/test2.c 9d611c45e1b07039a2bd95f5ea73178362b23229 F src/test3.c 7d06add423e4a90ec1a2e8d02006f82081109558 F src/test4.c 6e3e31acfaf21d66420fc35fda5b17dc0000cc8d +F src/test5.c 8a3dd24fa84b497243014f92c6afa1ee5ac735b6 F src/tokenize.c 6676b946fd8825b67ab52140af4fdc57a70bda48 F src/trigger.c a9927b57c865b6f3df3fb5e40c9824d722660ded F src/update.c 4c50328ebc127852bde8e2950eb8933234802c21 -F src/utf.c 8d74ddbfffdc1f2e87bfc11b8c1e2a806313a715 +F src/utf.c 1f2ac0c4247258196ce97575144e7793a46be2cd F src/util.c b2287b07ddf55ef7aaa8888a9473123995a69f40 F src/vacuum.c a4e8464c9f6d60659c5343e9d62c742463227820 F src/vdbe.c 7c33f761fdc799633468766fb53eda4301daa6b3 @@ -189,7 +190,7 @@ F www/sqlite.tcl 3c83b08cf9f18aa2d69453ff441a36c40e431604 F www/tclsqlite.tcl b9271d44dcf147a93c98f8ecf28c927307abd6da F www/vdbe.tcl 9b9095d4495f37697fd1935d10e14c6015e80aa1 F www/whentouse.tcl a8335bce47cc2fddb07f19052cb0cb4d9129a8e4 -P 89b42c468f437003f74a1785370e75b2585fa9e2 -R a73f1cc52bfc16883d27a39f1dbd6caa -U drh -Z 27226ab3419af317804aa8669b342fd3 +P bfb3234dc60c9152fdf0a16b887089365443f5ed +R f2be046f24e5df4901763f3eff8c0a1b +U danielk1977 +Z 887f209fe103a2fa5cebdc4c0fa96afc diff --git a/manifest.uuid b/manifest.uuid index 4bb39156a4..8296788a08 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -bfb3234dc60c9152fdf0a16b887089365443f5ed \ No newline at end of file +71260ff7f7030f56c292b43f83a213c65c9a184e \ No newline at end of file diff --git a/src/sqliteInt.h b/src/sqliteInt.h index c3e3de6c07..abf3d8436f 100644 --- a/src/sqliteInt.h 
+++ b/src/sqliteInt.h @@ -11,7 +11,7 @@ ************************************************************************* ** Internal interface definitions for SQLite. ** -** @(#) $Id: sqliteInt.h,v 1.221 2004/04/26 14:10:22 drh Exp $ +** @(#) $Id: sqliteInt.h,v 1.222 2004/05/06 23:37:53 danielk1977 Exp $ */ #include "config.h" #include "sqlite.h" @@ -1268,3 +1268,10 @@ int sqliteFixTriggerStep(DbFixer*, TriggerStep*); double sqliteAtoF(const char *z, const char **); char *sqlite_snprintf(int,char*,const char*,...); int sqliteFitsIn32Bits(const char *); + +unsigned char *sqlite3utf16to8(const void *pData, int N); +void *sqlite3utf8to16be(const unsigned char *pIn, int N); +void *sqlite3utf8to16le(const unsigned char *pIn, int N); +void sqlite3utf16to16le(void *pData, int N); +void sqlite3utf16to16be(void *pData, int N); + diff --git a/src/tclsqlite.c b/src/tclsqlite.c index e1ca483736..73c1ea4b30 100644 --- a/src/tclsqlite.c +++ b/src/tclsqlite.c @@ -11,7 +11,7 @@ ************************************************************************* ** A TCL Interface to SQLite ** -** $Id: tclsqlite.c,v 1.60 2004/04/26 14:10:22 drh Exp $ +** $Id: tclsqlite.c,v 1.61 2004/05/06 23:37:53 danielk1977 Exp $ */ #ifndef NO_TCL /* Omit this whole file if TCL is unavailable */ @@ -1208,11 +1208,13 @@ int TCLSH_MAIN(int argc, char **argv){ extern int Sqlitetest2_Init(Tcl_Interp*); extern int Sqlitetest3_Init(Tcl_Interp*); extern int Sqlitetest4_Init(Tcl_Interp*); + extern int Sqlitetest5_Init(Tcl_Interp*); extern int Md5_Init(Tcl_Interp*); /* Sqlitetest1_Init(interp); */ Sqlitetest2_Init(interp); /* Sqlitetest3_Init(interp); */ /* Sqlitetest4_Init(interp); */ + Sqlitetest5_Init(interp); Md5_Init(interp); } #endif diff --git a/src/test5.c b/src/test5.c new file mode 100644 index 0000000000..aa8cc26f98 --- /dev/null +++ b/src/test5.c @@ -0,0 +1,196 @@ +/* +** 2001 September 15 +** +** The author disclaims copyright to this source code. 
In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+*************************************************************************
+** Code for testing the utf.c module in SQLite.  This code
+** is not included in the SQLite library.  It is used for automated
+** testing of the SQLite library.
+**
+** $Id:
+*/
+#include "sqliteInt.h"
+#include "tcl.h"
+#include <stdlib.h>
+#include <string.h>
+
+/*
+** Return the number of bytes up to and including the first \u0000
+** character in the UTF-16 string pZ.
+**
+** The string is scanned two bytes at a time, so pZ must contain a
+** two-byte zero terminator at an even offset; no other bound is applied.
+*/
+static int utf16_length(const unsigned char *pZ){
+  const unsigned char *pC1 = pZ;
+  const unsigned char *pC2 = pZ+1;
+  while( *pC1 || *pC2 ){
+    pC1 += 2;
+    pC2 += 2;
+  }
+  return (pC1-pZ)+2;
+}
+
+/*
+** Tcl command:   sqlite_utf8to16le BYTES
+**
+** Hand the byte-array argument to sqlite3utf8to16le() and return the
+** converted string, including its two-byte terminator, as a byte array.
+** The length argument passed to the converter is -1, so the converter
+** (presumably) locates the end of the input itself -- the byte array
+** therefore needs to carry its own terminator.
+*/
+static int sqlite_utf8to16le(
+  void * clientData,
+  Tcl_Interp *interp,
+  int objc,
+  Tcl_Obj *CONST objv[]
+){
+  unsigned char *out;
+  unsigned char *in;
+  Tcl_Obj *res;
+
+  if( objc!=2 ){
+    Tcl_AppendResult(interp, "wrong # args: should be \"",
+        Tcl_GetStringFromObj(objv[0], 0), " BYTES\"", 0);
+    return TCL_ERROR;
+  }
+
+  in = Tcl_GetByteArrayFromObj(objv[1], 0);
+  out = (unsigned char *)sqlite3utf8to16le(in, -1);
+  if( !out ){
+    /* Do not measure or free a buffer that was never produced. */
+    Tcl_AppendResult(interp, "conversion failed", 0);
+    return TCL_ERROR;
+  }
+  /* Tcl_NewByteArrayObj() copies the bytes, so out can be freed at once. */
+  res = Tcl_NewByteArrayObj(out, utf16_length(out));
+  sqliteFree(out);
+
+  Tcl_SetObjResult(interp, res);
+
+  return TCL_OK;
+}
+
+/*
+** Tcl command:   sqlite_utf8to16be BYTES
+**
+** Same as sqlite_utf8to16le, but the conversion is performed by
+** sqlite3utf8to16be().
+*/
+static int sqlite_utf8to16be(
+  void * clientData,
+  Tcl_Interp *interp,
+  int objc,
+  Tcl_Obj *CONST objv[]
+){
+  unsigned char *out;
+  unsigned char *in;
+  Tcl_Obj *res;
+
+  if( objc!=2 ){
+    Tcl_AppendResult(interp, "wrong # args: should be \"",
+        Tcl_GetStringFromObj(objv[0], 0), " BYTES\"", 0);
+    return TCL_ERROR;
+  }
+
+  in = Tcl_GetByteArrayFromObj(objv[1], 0);
+  out = (unsigned char *)sqlite3utf8to16be(in, -1);
+  if( !out ){
+    Tcl_AppendResult(interp, "conversion failed", 0);
+    return TCL_ERROR;
+  }
+  res = Tcl_NewByteArrayObj(out, utf16_length(out));
+  sqliteFree(out);
+
+  Tcl_SetObjResult(interp, res);
+
+  return TCL_OK;
+}
+
+/*
+** Tcl command:   sqlite_utf16to16le BYTES
+**
+** Copy the byte-array argument into a scratch buffer, run
+** sqlite3utf16to16le() on the copy (the routine works in place), and
+** return the copy up to and including its two-byte terminator.
+*/
+static int sqlite_utf16to16le(
+  void * clientData,
+  Tcl_Interp *interp,
+  int objc,
+  Tcl_Obj *CONST objv[]
+){
+  unsigned char *out;
+  unsigned char *in;
+  int in_len;
+  Tcl_Obj *res;
+
+  if( objc!=2 ){
+    Tcl_AppendResult(interp, "wrong # args: should be \"",
+        Tcl_GetStringFromObj(objv[0], 0), " BYTES\"", 0);
+    return TCL_ERROR;
+  }
+
+  /* Work on a private copy: the Tcl object's bytes must not be modified. */
+  in = Tcl_GetByteArrayFromObj(objv[1], &in_len);
+  out = (unsigned char *)sqliteMalloc(in_len);
+  if( !out ){
+    Tcl_AppendResult(interp, "out of memory", 0);
+    return TCL_ERROR;
+  }
+  memcpy(out, in, in_len);
+
+  sqlite3utf16to16le(out, -1);
+  res = Tcl_NewByteArrayObj(out, utf16_length(out));
+  sqliteFree(out);
+
+  Tcl_SetObjResult(interp, res);
+
+  return TCL_OK;
+}
+
+/*
+** Tcl command:   sqlite_utf16to16be BYTES
+**
+** Same as sqlite_utf16to16le, but the in-place conversion is performed
+** by sqlite3utf16to16be().
+*/
+static int sqlite_utf16to16be(
+  void * clientData,
+  Tcl_Interp *interp,
+  int objc,
+  Tcl_Obj *CONST objv[]
+){
+  unsigned char *out;
+  unsigned char *in;
+  int in_len;
+  Tcl_Obj *res;
+
+  if( objc!=2 ){
+    Tcl_AppendResult(interp, "wrong # args: should be \"",
+        Tcl_GetStringFromObj(objv[0], 0), " BYTES\"", 0);
+    return TCL_ERROR;
+  }
+
+  in = Tcl_GetByteArrayFromObj(objv[1], &in_len);
+  out = (unsigned char *)sqliteMalloc(in_len);
+  if( !out ){
+    Tcl_AppendResult(interp, "out of memory", 0);
+    return TCL_ERROR;
+  }
+  memcpy(out, in, in_len);
+
+  sqlite3utf16to16be(out, -1);
+  res = Tcl_NewByteArrayObj(out, utf16_length(out));
+  sqliteFree(out);
+
+  Tcl_SetObjResult(interp, res);
+
+  return TCL_OK;
+}
+
+/*
+** Tcl command:   sqlite_utf16to8 BYTES
+**
+** Hand the byte-array argument to sqlite3utf16to8() and return the
+** resulting UTF-8 bytes, without the terminating \000, as a byte array.
+*/
+static int sqlite_utf16to8(
+  void * clientData,
+  Tcl_Interp *interp,
+  int objc,
+  Tcl_Obj *CONST objv[]
+){
+  unsigned char *out;
+  unsigned char *in;
+  Tcl_Obj *res;
+
+  if( objc!=2 ){
+    Tcl_AppendResult(interp, "wrong # args: should be \"",
+        Tcl_GetStringFromObj(objv[0], 0), " BYTES\"", 0);
+    return TCL_ERROR;
+  }
+
+  in = Tcl_GetByteArrayFromObj(objv[1], 0);
+  out = sqlite3utf16to8(in, -1);
+  if( !out ){
+    Tcl_AppendResult(interp, "conversion failed", 0);
+    return TCL_ERROR;
+  }
+  res = Tcl_NewByteArrayObj(out, strlen((const char *)out));
+  sqliteFree(out);
+
+  Tcl_SetObjResult(interp, res);
+
+  return TCL_OK;
+}
+
+
+/*
+** Register commands with the TCL interpreter.
+*/
+int Sqlitetest5_Init(Tcl_Interp *interp){
+  static struct {
+    char *zName;
+    Tcl_CmdProc *xProc;
+  } aCmd[] = {
+    { "sqlite_utf16to8", (Tcl_CmdProc*)sqlite_utf16to8 },
+    { "sqlite_utf8to16le", (Tcl_CmdProc*)sqlite_utf8to16le },
+    { "sqlite_utf8to16be", (Tcl_CmdProc*)sqlite_utf8to16be },
+    { "sqlite_utf16to16le", (Tcl_CmdProc*)sqlite_utf16to16le },
+    { "sqlite_utf16to16be", (Tcl_CmdProc*)sqlite_utf16to16be }
+  };
+  int i;
+  for(i=0; i
+#include
+#include "sqliteInt.h"
+
+typedef struct UtfString UtfString;
+struct UtfString {
+  unsigned char *pZ;    /* Raw string data */
+  int n;                /* Allocated length of pZ in bytes */
+  int c;                /* Number of pZ bytes already read or written */
+};
+
+/* TODO: Implement this macro in os.h. It should be 1 on big-endian
+** machines, and 0 on little-endian.
+*/
+#define SQLITE3_NATIVE_BIGENDIAN 0
+
+/* Byte-order-mark values, expressed as the 16-bit value obtained from the
+** first two bytes of a string.
+**
+** NOTE(review): readUtf16Bom() obtains that value with BE16(), which
+** combines the bytes in a fixed order regardless of the host, so swapping
+** the two constants when SQLITE3_NATIVE_BIGENDIAN==1 looks incorrect --
+** confirm before enabling the big-endian branch.
+*/
+#if SQLITE3_NATIVE_BIGENDIAN == 1
+#define BOM_BIGENDIAN 0x0000FFFE
+#define BOM_LITTLEENDIAN 0x0000FEFF
+#else
+#define BOM_BIGENDIAN 0x0000FEFF
+#define BOM_LITTLEENDIAN 0x0000FFFE
+#endif
+
+/*
+** These two macros are used to interpret the first two bytes of the
+** unsigned char array pZ as a 16-bit unsigned int. BE16() for a big-endian
+** interpretation, LE16() for little-endian.
+*/
+#define BE16(pZ) (((u16)((pZ)[0])<<8) + (u16)((pZ)[1]))
+#define LE16(pZ) (((u16)((pZ)[1])<<8) + (u16)((pZ)[0]))
+
+/*
+** READ_16 interprets the first two bytes of the unsigned char array pZ
+** as a 16-bit unsigned int. If big_endian is non-zero the interpretation
+** is big-endian, otherwise little-endian.
+*/
+#define READ_16(pZ,big_endian) (big_endian?BE16(pZ):LE16(pZ))
+
+/*
+** Read the BOM from the start of *pStr, if one is present. Return zero
+** for little-endian, non-zero for big-endian. If no BOM is present, return
+** the machine's native byte order.
+**
+** Return values:
+**     1 -> big-endian string
+**     0 -> little-endian string
+**
+** When a BOM is found, pStr->c is advanced past it (set to 2); otherwise
+** pStr->c is left at 0.
+*/
+static int readUtf16Bom(UtfString *pStr){
+  /* The BOM must be the first thing read from the string */
+  assert( pStr->c==0 );
+
+  /* If the string data consists of 1 byte or less, the BOM will make no
+  ** difference anyway. In this case just fall through to the default case
+  ** and return the native byte-order for this machine.
+  **
+  ** Otherwise, check the first 2 bytes of the string to see if a BOM is
+  ** present.
+  */
+  if( pStr->n>1 ){
+    /* BE16() assembles the two bytes in a fixed order, independent of the
+    ** host, so the result is compared against fixed values: the bytes
+    ** FE FF mark a big-endian string, FF FE a little-endian one.
+    */
+    u32 bom = BE16(pStr->pZ);
+    if( bom==0x0000FEFF ){
+      pStr->c = 2;
+      return 1;
+    }
+    if( bom==0x0000FFFE ){
+      pStr->c = 2;
+      return 0;
+    }
+  }
+
+  return SQLITE3_NATIVE_BIGENDIAN;
+}
+
+
+/*
+** Read a single unicode character from the UTF-8 encoded string *pStr. The
+** value returned is a unicode scalar value. In the case of malformed
+** strings, the unicode replacement character U+FFFD may be returned.
+**
+** pStr->c is advanced past every byte consumed. At least one byte is
+** always consumed, so a caller looping while pStr->c<pStr->n is guaranteed
+** to make progress. When U+FFFD is returned because of a bad or missing
+** continuation byte, only the lead byte has been consumed.
+*/
+static u32 readUtf8(UtfString *pStr){
+  struct Utf8TblRow {
+    u8 b1_mask;           /* Bits of the lead byte that identify the form */
+    u8 b1_masked_val;     /* Expected value of (lead byte & b1_mask) */
+    u8 b1_value_mask;     /* Bits of the lead byte that carry payload */
+    int trailing_bytes;   /* Number of continuation bytes that follow */
+  };
+  static const struct Utf8TblRow utf8tbl[] = {
+    { 0x80, 0x00, 0x7F, 0 },    /* 0xxxxxxx */
+    { 0xE0, 0xC0, 0x1F, 1 },    /* 110xxxxx */
+    { 0xF0, 0xE0, 0x0F, 2 },    /* 1110xxxx */
+    { 0xF8, 0xF0, 0x07, 3 },    /* 11110xxx: three payload bits only */
+    { 0, 0, 0, 0}               /* Sentinel: no form matched */
+  };
+
+  u8 b1;      /* First byte of the potentially multi-byte utf-8 character */
+  u32 ret = 0;          /* Return value */
+  int ii;
+  struct Utf8TblRow const *pRow;
+
+  pRow = &(utf8tbl[0]);
+
+  b1 = pStr->pZ[pStr->c];
+  pStr->c++;
+  while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){
+    pRow++;
+  }
+  if( !pRow->b1_mask ){
+    /* Stray continuation byte or an invalid lead byte (0xF8..0xFF). */
+    return 0xFFFD;
+  }
+
+  ret = (u32)(b1&pRow->b1_value_mask);
+  for( ii=0; ii<pRow->trailing_bytes; ii++ ){
+    u8 b;
+    /* Do not read past the end of the buffer on a truncated sequence. */
+    if( (pStr->c+ii)>=pStr->n ){
+      return 0xFFFD;
+    }
+    b = pStr->pZ[pStr->c+ii];
+    if( (b&0xC0)!=0x80 ){
+      return 0xFFFD;
+    }
+    ret = (ret<<6) + (u32)(b&0x3F);
+  }
+
+  pStr->c += pRow->trailing_bytes;
+  return ret;
+}
+
+/*
+** Write the unicode character 'code' to the string pStr using UTF-8
+** encoding.
SQLITE_NOMEM may be returned if sqlite3Malloc() fails.
+**
+** This implementation performs no allocation and always returns 0: the
+** caller must have sized pStr->pZ so that the character fits (this is
+** checked by an assert() only). Values above U+10FFFF cannot be encoded
+** and are written as the replacement character U+FFFD.
+*/
+static int writeUtf8(UtfString *pStr, u32 code){
+  struct Utf8WriteTblRow {
+    u32 max_code;         /* Largest code that fits this form */
+    int trailing_bytes;   /* Number of continuation bytes */
+    u8 b1_and_mask;       /* Payload bits kept in the lead byte */
+    u8 b1_or_mask;        /* Marker bits set in the lead byte */
+  };
+  static const struct Utf8WriteTblRow utf8tbl[] = {
+    {0x0000007F, 0, 0x7F, 0x00},
+    {0x000007FF, 1, 0xDF, 0xC0},
+    {0x0000FFFF, 2, 0xEF, 0xE0},
+    {0x0010FFFF, 3, 0xF7, 0xF0},
+    {0x00000000, 0, 0x00, 0x00}
+  };
+  /* Not static: the search must restart from the first row on every call. */
+  const struct Utf8WriteTblRow *pRow = &utf8tbl[0];
+
+  if( code>0x0010FFFF ){
+    code = 0xFFFD;
+  }
+
+  /* Find the first (shortest) form large enough to hold the code. The
+  ** clamp above guarantees the search stops before the sentinel row.
+  */
+  while( code>pRow->max_code ){
+    pRow++;
+    assert( pRow->max_code );
+  }
+
+  /* Ensure there is enough room left in the output buffer to write
+  ** this UTF-8 character.
+  */
+  assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) );
+
+  /* Write the UTF-8 encoded character to pStr. All cases below are
+  ** intentionally fall-through: the continuation bytes are emitted from
+  ** last to first, consuming six bits of 'code' each.
+  */
+  switch( pRow->trailing_bytes ){
+    case 3:
+      pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)|0x80;
+      code = code>>6;
+      /* fall through */
+    case 2:
+      pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)|0x80;
+      code = code>>6;
+      /* fall through */
+    case 1:
+      pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)|0x80;
+      code = code>>6;
+      /* fall through */
+    case 0:
+      pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))|(pRow->b1_or_mask);
+  }
+  pStr->c += (pRow->trailing_bytes + 1);
+
+  return 0;
+}
+
+/*
+** Read a single unicode character from the UTF-16 encoded string *pStr. The
+** value returned is a unicode scalar value. In the case of malformed
+** strings, the unicode replacement character U+FFFD may be returned.
+**
+** If big_endian is true, the string is assumed to be UTF-16BE encoded.
+** Otherwise, it is UTF-16LE encoded.
+**
+** pStr->c is advanced past the bytes consumed. A leading surrogate that is
+** not followed by a trailing surrogate consumes only its own two bytes.
+*/
+static u32 readUtf16(UtfString *pStr, int big_endian){
+  u32 code_point;   /* the first code-point in the character */
+
+  /* If there is only one byte of data left in the string, return the
+  ** replacement character.
+  */
+  if( (pStr->n-pStr->c)==1 ){
+    pStr->c++;
+    return (int)0xFFFD;
+  }
+
+  code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian);
+  pStr->c += 2;
+
+  /* If this is a non-surrogate code-point, just cast it to an int and
+  ** return the code-point value. The surrogate range is 0xD800..0xDFFF
+  ** inclusive; 0xE000 is an ordinary code-point.
+  */
+  if( code_point<0xD800 || code_point>0xDFFF ){
+    return code_point;
+  }
+
+  /* If this is a trailing surrogate code-point, then the string is
+  ** malformed; return the replacement character.
+  */
+  if( code_point>0xDBFF ){
+    return 0xFFFD;
+  }
+
+  /* The code-point just read is a leading surrogate code-point. If there
+  ** is not enough data left or the next code-point is not a trailing
+  ** surrogate, return the replacement character.
+  */
+  if( (pStr->n-pStr->c)>1 ){
+    u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian);
+    if( code_point2<0xDC00 || code_point2>0xDFFF ){
+      return 0xFFFD;
+    }
+    pStr->c += 2;
+
+    /* Leading surrogate:  110110ww wwxxxxxx   (uuuuu = wwww + 1)
+    ** Trailing surrogate: 110111yy yyyyyyyy
+    ** Scalar value:       uuuuu xxxxxx yyyyyyyyyy
+    **
+    ** wwww occupies bits 6..9 of the leading surrogate and uuuuu bits
+    ** 16..20 of the result, hence the shift by 10.
+    */
+    return (
+        (((code_point&0x03C0)+0x0040)<<10) +   /* uuuuu */
+        ((code_point&0x003F)<<10) +            /* xxxxxx */
+        (code_point2&0x03FF)                   /* yy yyyyyyyy */
+    );
+
+  }else{
+    return (int)0xFFFD;
+  }
+
+  /* not reached */
+}
+
+/*
+** Write the unicode character 'code' to the string pStr using UTF-16
+** encoding: big-endian if big_endian is true, little-endian otherwise.
+** Codes above 0xFFFF are written as a surrogate pair (4 bytes), all others
+** as a single 16-bit unit (2 bytes).
+**
+** The caller must have sized pStr->pZ so that the character fits (checked
+** by an assert() only). Always returns 0.
+*/
+static int writeUtf16(UtfString *pStr, int code, int big_endian){
+  int bytes;
+  unsigned char *hi_byte;
+  unsigned char *lo_byte;
+
+  bytes = (code>0x0000FFFF?4:2);
+
+  /* Ensure there is enough room left in the output buffer to write
+  ** this UTF-16 character.
+  */
+  assert( (pStr->n-pStr->c)>=bytes );
+
+  /* Initialise hi_byte and lo_byte to point at the locations into which
+  ** the MSB and LSB of the (first) 16-bit unicode code-point written for
+  ** this character.
+  */
+  hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]);
+  lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]);
+
+  if( bytes==2 ){
+    *hi_byte = (u8)((code&0x0000FF00)>>8);
+    *lo_byte = (u8)(code&0x000000FF);
+  }else{
+    u32 wrd;
+    /* Leading surrogate: 0xD800 | (uuuuu-1)<<6 | xxxxxx */
+    wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)|0x0000D800;
+    *hi_byte = (u8)((wrd&0x0000FF00)>>8);
+    *lo_byte = (u8)(wrd&0x000000FF);
+
+    /* Trailing surrogate: 0xDC00 | low ten bits. It sits two bytes after
+    ** the leading one, with the same byte order.
+    */
+    wrd = (code&0x000003FF)|0x0000DC00;
+    *(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8);
+    *(lo_byte+2) = (u8)(wrd&0x000000FF);
+  }
+
+  pStr->c += bytes;
+
+  return 0;
+}
+
+/*
+** Return the number of bytes up to (but not including) the first \u0000
+** character in the UTF-16 string pZ. The string is scanned two bytes at a
+** time and must contain a two-byte zero terminator.
+*/
+static int utf16Bytelen(const unsigned char *pZ){
+  const unsigned char *pC1 = pZ;
+  const unsigned char *pC2 = pZ+1;
+  while( *pC1 || *pC2 ){
+    pC1 += 2;
+    pC2 += 2;
+  }
+  return pC1-pZ;
+}
+
 /*
 ** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
 ** "BOM") into a UTF-8 string.  The UTF-8 string is written into space
-** obtained from sqlit3Malloc() and must be released by the calling function.
+** obtained from sqlite3Malloc() and must be released by the calling function.
 **
 ** The parameter N is the number of bytes in the UTF-16 string.  If N is
 ** negative, the entire string up to the first \u0000 character is translated.
@@ -45,7 +348,113 @@
 ** The returned UTF-8 string is always \000 terminated.
 */
 unsigned char *sqlite3utf16to8(const void *pData, int N){
-  unsigned char *in = (unsigned char *)pData;
+  UtfString in;
+  UtfString out;
+  int big_endian;
+
+  out.pZ = 0;
+
+  in.pZ = (unsigned char *)pData;
+  in.n = N;
+  in.c = 0;
+
+  if( in.n<0 ){
+    in.n = utf16Bytelen(in.pZ);
+  }
+
+  /* A UTF-8 encoding of a unicode string can require at most 1.5 times as
+  ** much space to store as the same string encoded using UTF-16. Allocate
+  ** this now.
+ */ + out.n = (in.n*1.5) + 1; + out.pZ = sqliteMalloc(in.n); + if( !out.pZ ){ + return 0; + } + out.c = 0; + + big_endian = readUtf16Bom(&in); + while( in.c