mirror of
https://github.com/sqlite/sqlite.git
synced 2025-08-07 02:42:48 +03:00
Add code to convert between the various supported unicode encoding schemes.
Untested at this point. (CVS 1315) FossilOrigin-Name: 71260ff7f7030f56c292b43f83a213c65c9a184e
This commit is contained in:
6
main.mk
6
main.mk
@@ -55,7 +55,7 @@ TCCX = $(TCC) $(OPTS) $(THREADSAFE) $(USLEEP) -I. -I$(TOP)/src
|
|||||||
# Object files for the SQLite library.
|
# Object files for the SQLite library.
|
||||||
#
|
#
|
||||||
LIBOBJ = hash.o os.o pager.o random.o \
|
LIBOBJ = hash.o os.o pager.o random.o \
|
||||||
util.o tclsqlite.o
|
util.o tclsqlite.o utf.o
|
||||||
|
|
||||||
LIBOBJ_ORIG = attach.o auth.o btree.o btree_rb.o build.o copy.o date.o delete.o \
|
LIBOBJ_ORIG = attach.o auth.o btree.o btree_rb.o build.o copy.o date.o delete.o \
|
||||||
expr.o func.o hash.o insert.o \
|
expr.o func.o hash.o insert.o \
|
||||||
@@ -121,6 +121,7 @@ TESTSRC = \
|
|||||||
$(TOP)/src/os.c \
|
$(TOP)/src/os.c \
|
||||||
$(TOP)/src/pager.c \
|
$(TOP)/src/pager.c \
|
||||||
$(TOP)/src/test2.c \
|
$(TOP)/src/test2.c \
|
||||||
|
$(TOP)/src/test5.c \
|
||||||
$(TOP)/src/md5.c
|
$(TOP)/src/md5.c
|
||||||
|
|
||||||
TESTSRC_ORIG = \
|
TESTSRC_ORIG = \
|
||||||
@@ -230,6 +231,9 @@ os.o: $(TOP)/src/os.c $(HDR)
|
|||||||
parse.o: parse.c $(HDR)
|
parse.o: parse.c $(HDR)
|
||||||
$(TCCX) -c parse.c
|
$(TCCX) -c parse.c
|
||||||
|
|
||||||
|
utf.o: $(TOP)/src/utf.c $(HDR)
|
||||||
|
$(TCCX) -c $(TOP)/src/utf.c
|
||||||
|
|
||||||
parse.h: parse.c
|
parse.h: parse.c
|
||||||
|
|
||||||
parse.c: $(TOP)/src/parse.y lemon
|
parse.c: $(TOP)/src/parse.y lemon
|
||||||
|
21
manifest
21
manifest
@@ -1,5 +1,5 @@
|
|||||||
C Update\stest3.c\sto\swork\swith\sthe\snew\sbtree.c\sAPI.\s(CVS\s1314)
|
C Add\scode\sto\sconvert\sbetween\sthe\svarious\ssupported\sunicode\sencoding\sschemes.\nUntested\sat\sthis\spoint.\s(CVS\s1315)
|
||||||
D 2004-05-04T17:27:28
|
D 2004-05-06T23:37:52
|
||||||
F Makefile.in ab7b0d5118e2da97bac66be8684a1034e3500f5a
|
F Makefile.in ab7b0d5118e2da97bac66be8684a1034e3500f5a
|
||||||
F Makefile.linux-gcc b86a99c493a5bfb402d1d9178dcdc4bd4b32f906
|
F Makefile.linux-gcc b86a99c493a5bfb402d1d9178dcdc4bd4b32f906
|
||||||
F README f1de682fbbd94899d50aca13d387d1b3fd3be2dd
|
F README f1de682fbbd94899d50aca13d387d1b3fd3be2dd
|
||||||
@@ -15,7 +15,7 @@ F doc/lemon.html f0f682f50210928c07e562621c3b7e8ab912a538
|
|||||||
F doc/report1.txt a031aaf37b185e4fa540223cb516d3bccec7eeac
|
F doc/report1.txt a031aaf37b185e4fa540223cb516d3bccec7eeac
|
||||||
F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895
|
F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895
|
||||||
F ltmain.sh f6b283068efa69f06eb8aa1fe4bddfdbdeb35826
|
F ltmain.sh f6b283068efa69f06eb8aa1fe4bddfdbdeb35826
|
||||||
F main.mk 1318f38db512abb0abdbdf4d3c9dd213e2960977
|
F main.mk b0b22dffdd728c3a96c2b2e87e01fe86a017fa34
|
||||||
F publish.sh 1cd5c982388560fa91eedf6a338e210f713b35c8
|
F publish.sh 1cd5c982388560fa91eedf6a338e210f713b35c8
|
||||||
F spec.template a38492f1c1dd349fc24cb0565e08afc53045304b
|
F spec.template a38492f1c1dd349fc24cb0565e08afc53045304b
|
||||||
F sqlite.1 83f4a9d37bdf2b7ef079a82d54eaf2e3509ee6ea
|
F sqlite.1 83f4a9d37bdf2b7ef079a82d54eaf2e3509ee6ea
|
||||||
@@ -49,17 +49,18 @@ F src/random.c 775913e0b7fbd6295d21f12a7bd35b46387c44b2
|
|||||||
F src/select.c 3833e2b64cc6d249385ee44e13bf49c9ae5b903d
|
F src/select.c 3833e2b64cc6d249385ee44e13bf49c9ae5b903d
|
||||||
F src/shell.c 920af040d3a33ea8919c82cee45b424ad841cee0
|
F src/shell.c 920af040d3a33ea8919c82cee45b424ad841cee0
|
||||||
F src/sqlite.h.in 35bec264dfb4965bbfeb7e75221f8658f210c30d
|
F src/sqlite.h.in 35bec264dfb4965bbfeb7e75221f8658f210c30d
|
||||||
F src/sqliteInt.h 36c649d7f2ab0affdc44e51c681dd1d5723ee1e9
|
F src/sqliteInt.h dca24187ffe06e9207f9f400a2a0632ea483e55b
|
||||||
F src/table.c d845cb101b5afc1f7fea083c99e3d2fa7998d895
|
F src/table.c d845cb101b5afc1f7fea083c99e3d2fa7998d895
|
||||||
F src/tclsqlite.c e816201db3ea6ba857a0351547be1d4b7286e95d
|
F src/tclsqlite.c 3a5f8192ed5279a68610102a92a3a9f0cdd09e68
|
||||||
F src/test1.c 9aa62b89d420e6763b5e7ae89a47f6cf87370477
|
F src/test1.c 9aa62b89d420e6763b5e7ae89a47f6cf87370477
|
||||||
F src/test2.c 9d611c45e1b07039a2bd95f5ea73178362b23229
|
F src/test2.c 9d611c45e1b07039a2bd95f5ea73178362b23229
|
||||||
F src/test3.c 7d06add423e4a90ec1a2e8d02006f82081109558
|
F src/test3.c 7d06add423e4a90ec1a2e8d02006f82081109558
|
||||||
F src/test4.c 6e3e31acfaf21d66420fc35fda5b17dc0000cc8d
|
F src/test4.c 6e3e31acfaf21d66420fc35fda5b17dc0000cc8d
|
||||||
|
F src/test5.c 8a3dd24fa84b497243014f92c6afa1ee5ac735b6
|
||||||
F src/tokenize.c 6676b946fd8825b67ab52140af4fdc57a70bda48
|
F src/tokenize.c 6676b946fd8825b67ab52140af4fdc57a70bda48
|
||||||
F src/trigger.c a9927b57c865b6f3df3fb5e40c9824d722660ded
|
F src/trigger.c a9927b57c865b6f3df3fb5e40c9824d722660ded
|
||||||
F src/update.c 4c50328ebc127852bde8e2950eb8933234802c21
|
F src/update.c 4c50328ebc127852bde8e2950eb8933234802c21
|
||||||
F src/utf.c 8d74ddbfffdc1f2e87bfc11b8c1e2a806313a715
|
F src/utf.c 1f2ac0c4247258196ce97575144e7793a46be2cd
|
||||||
F src/util.c b2287b07ddf55ef7aaa8888a9473123995a69f40
|
F src/util.c b2287b07ddf55ef7aaa8888a9473123995a69f40
|
||||||
F src/vacuum.c a4e8464c9f6d60659c5343e9d62c742463227820
|
F src/vacuum.c a4e8464c9f6d60659c5343e9d62c742463227820
|
||||||
F src/vdbe.c 7c33f761fdc799633468766fb53eda4301daa6b3
|
F src/vdbe.c 7c33f761fdc799633468766fb53eda4301daa6b3
|
||||||
@@ -189,7 +190,7 @@ F www/sqlite.tcl 3c83b08cf9f18aa2d69453ff441a36c40e431604
|
|||||||
F www/tclsqlite.tcl b9271d44dcf147a93c98f8ecf28c927307abd6da
|
F www/tclsqlite.tcl b9271d44dcf147a93c98f8ecf28c927307abd6da
|
||||||
F www/vdbe.tcl 9b9095d4495f37697fd1935d10e14c6015e80aa1
|
F www/vdbe.tcl 9b9095d4495f37697fd1935d10e14c6015e80aa1
|
||||||
F www/whentouse.tcl a8335bce47cc2fddb07f19052cb0cb4d9129a8e4
|
F www/whentouse.tcl a8335bce47cc2fddb07f19052cb0cb4d9129a8e4
|
||||||
P 89b42c468f437003f74a1785370e75b2585fa9e2
|
P bfb3234dc60c9152fdf0a16b887089365443f5ed
|
||||||
R a73f1cc52bfc16883d27a39f1dbd6caa
|
R f2be046f24e5df4901763f3eff8c0a1b
|
||||||
U drh
|
U danielk1977
|
||||||
Z 27226ab3419af317804aa8669b342fd3
|
Z 887f209fe103a2fa5cebdc4c0fa96afc
|
||||||
|
@@ -1 +1 @@
|
|||||||
bfb3234dc60c9152fdf0a16b887089365443f5ed
|
71260ff7f7030f56c292b43f83a213c65c9a184e
|
@@ -11,7 +11,7 @@
|
|||||||
*************************************************************************
|
*************************************************************************
|
||||||
** Internal interface definitions for SQLite.
|
** Internal interface definitions for SQLite.
|
||||||
**
|
**
|
||||||
** @(#) $Id: sqliteInt.h,v 1.221 2004/04/26 14:10:22 drh Exp $
|
** @(#) $Id: sqliteInt.h,v 1.222 2004/05/06 23:37:53 danielk1977 Exp $
|
||||||
*/
|
*/
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
#include "sqlite.h"
|
#include "sqlite.h"
|
||||||
@@ -1268,3 +1268,10 @@ int sqliteFixTriggerStep(DbFixer*, TriggerStep*);
|
|||||||
double sqliteAtoF(const char *z, const char **);
|
double sqliteAtoF(const char *z, const char **);
|
||||||
char *sqlite_snprintf(int,char*,const char*,...);
|
char *sqlite_snprintf(int,char*,const char*,...);
|
||||||
int sqliteFitsIn32Bits(const char *);
|
int sqliteFitsIn32Bits(const char *);
|
||||||
|
|
||||||
|
unsigned char *sqlite3utf16to8(const void *pData, int N);
|
||||||
|
void *sqlite3utf8to16be(const unsigned char *pIn, int N);
|
||||||
|
void *sqlite3utf8to16le(const unsigned char *pIn, int N);
|
||||||
|
void sqlite3utf16to16le(void *pData, int N);
|
||||||
|
void sqlite3utf16to16be(void *pData, int N);
|
||||||
|
|
||||||
|
@@ -11,7 +11,7 @@
|
|||||||
*************************************************************************
|
*************************************************************************
|
||||||
** A TCL Interface to SQLite
|
** A TCL Interface to SQLite
|
||||||
**
|
**
|
||||||
** $Id: tclsqlite.c,v 1.60 2004/04/26 14:10:22 drh Exp $
|
** $Id: tclsqlite.c,v 1.61 2004/05/06 23:37:53 danielk1977 Exp $
|
||||||
*/
|
*/
|
||||||
#ifndef NO_TCL /* Omit this whole file if TCL is unavailable */
|
#ifndef NO_TCL /* Omit this whole file if TCL is unavailable */
|
||||||
|
|
||||||
@@ -1208,11 +1208,13 @@ int TCLSH_MAIN(int argc, char **argv){
|
|||||||
extern int Sqlitetest2_Init(Tcl_Interp*);
|
extern int Sqlitetest2_Init(Tcl_Interp*);
|
||||||
extern int Sqlitetest3_Init(Tcl_Interp*);
|
extern int Sqlitetest3_Init(Tcl_Interp*);
|
||||||
extern int Sqlitetest4_Init(Tcl_Interp*);
|
extern int Sqlitetest4_Init(Tcl_Interp*);
|
||||||
|
extern int Sqlitetest5_Init(Tcl_Interp*);
|
||||||
extern int Md5_Init(Tcl_Interp*);
|
extern int Md5_Init(Tcl_Interp*);
|
||||||
/* Sqlitetest1_Init(interp); */
|
/* Sqlitetest1_Init(interp); */
|
||||||
Sqlitetest2_Init(interp);
|
Sqlitetest2_Init(interp);
|
||||||
/* Sqlitetest3_Init(interp); */
|
/* Sqlitetest3_Init(interp); */
|
||||||
/* Sqlitetest4_Init(interp); */
|
/* Sqlitetest4_Init(interp); */
|
||||||
|
Sqlitetest5_Init(interp);
|
||||||
Md5_Init(interp);
|
Md5_Init(interp);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
196
src/test5.c
Normal file
196
src/test5.c
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
/*
|
||||||
|
** 2001 September 15
|
||||||
|
**
|
||||||
|
** The author disclaims copyright to this source code. In place of
|
||||||
|
** a legal notice, here is a blessing:
|
||||||
|
**
|
||||||
|
** May you do good and not evil.
|
||||||
|
** May you find forgiveness for yourself and forgive others.
|
||||||
|
** May you share freely, never taking more than you give.
|
||||||
|
**
|
||||||
|
*************************************************************************
|
||||||
|
** Code for testing the utf.c module in SQLite. This code
|
||||||
|
** is not included in the SQLite library. It is used for automated
|
||||||
|
** testing of the SQLite library.
|
||||||
|
**
|
||||||
|
** $Id:
|
||||||
|
*/
|
||||||
|
#include "sqliteInt.h"
|
||||||
|
#include "tcl.h"
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
/*
** Return the size in bytes of the UTF-16 string pZ, counting the two
** bytes of the \u0000 terminator itself.  The string is examined two
** bytes at a time starting at pZ[0].
*/
static int utf16_length(const unsigned char *pZ){
  int nByte = 0;          /* Offset of the 16-bit unit being examined */
  for(;;){
    if( pZ[nByte]==0 && pZ[nByte+1]==0 ){
      break;
    }
    nByte += 2;
  }
  return nByte + 2;
}
|
||||||
|
|
||||||
|
static int sqlite_utf8to16le(
|
||||||
|
void * clientData,
|
||||||
|
Tcl_Interp *interp,
|
||||||
|
int objc,
|
||||||
|
Tcl_Obj *CONST objv[]
|
||||||
|
){
|
||||||
|
unsigned char *out;
|
||||||
|
unsigned char *in;
|
||||||
|
Tcl_Obj *res;
|
||||||
|
|
||||||
|
if( objc!=2 ){
|
||||||
|
Tcl_AppendResult(interp, "wrong # args: should be \"",
|
||||||
|
Tcl_GetStringFromObj(objv[0], 0), "<utf-8 encoded-string>", 0);
|
||||||
|
return TCL_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
in = Tcl_GetByteArrayFromObj(objv[1], 0);
|
||||||
|
out = (unsigned char *)sqlite3utf8to16le(in, -1);
|
||||||
|
res = Tcl_NewByteArrayObj(out, utf16_length(ret));
|
||||||
|
sqliteFree(out);
|
||||||
|
|
||||||
|
Tcl_SetObjResult(interp, res);
|
||||||
|
|
||||||
|
return TCL_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int sqlite_utf8to16be(
|
||||||
|
void * clientData,
|
||||||
|
Tcl_Interp *interp,
|
||||||
|
int objc,
|
||||||
|
Tcl_Obj *CONST objv[]
|
||||||
|
){
|
||||||
|
unsigned char *out;
|
||||||
|
unsigned char *in;
|
||||||
|
Tcl_Obj *res;
|
||||||
|
|
||||||
|
if( objc!=2 ){
|
||||||
|
Tcl_AppendResult(interp, "wrong # args: should be \"",
|
||||||
|
Tcl_GetStringFromObj(objv[0], 0), "<utf-8 encoded-string>", 0);
|
||||||
|
return TCL_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
in = Tcl_GetByteArrayFromObj(objv[1], 0);
|
||||||
|
out = (unsigned char *)sqlite3utf8to16be(in, -1);
|
||||||
|
res = Tcl_NewByteArrayObj(out, utf16_length(ret));
|
||||||
|
sqliteFree(out);
|
||||||
|
|
||||||
|
Tcl_SetObjResult(interp, res);
|
||||||
|
|
||||||
|
return TCL_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int sqlite_utf16to16le(
|
||||||
|
void * clientData,
|
||||||
|
Tcl_Interp *interp,
|
||||||
|
int objc,
|
||||||
|
Tcl_Obj *CONST objv[]
|
||||||
|
){
|
||||||
|
unsigned char *out;
|
||||||
|
unsigned char *in;
|
||||||
|
int in_len;
|
||||||
|
Tcl_Obj *res;
|
||||||
|
|
||||||
|
if( objc!=2 ){
|
||||||
|
Tcl_AppendResult(interp, "wrong # args: should be \"",
|
||||||
|
Tcl_GetStringFromObj(objv[0], 0), "<utf-16 encoded-string>", 0);
|
||||||
|
return TCL_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
in = Tcl_GetByteArrayFromObj(objv[1], &in_len);
|
||||||
|
out = (unsigned char *)sqliteMalloc(in_len);
|
||||||
|
memcpy(out, in, in_len);
|
||||||
|
|
||||||
|
sqlite3utf16to16le(out, -1);
|
||||||
|
res = Tcl_NewByteArrayObj(out, utf16_length(ret));
|
||||||
|
sqliteFree(out);
|
||||||
|
|
||||||
|
Tcl_SetObjResult(interp, res);
|
||||||
|
|
||||||
|
return TCL_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int sqlite_utf16to16be(
|
||||||
|
void * clientData,
|
||||||
|
Tcl_Interp *interp,
|
||||||
|
int objc,
|
||||||
|
Tcl_Obj *CONST objv[]
|
||||||
|
){
|
||||||
|
unsigned char *out;
|
||||||
|
unsigned char *in;
|
||||||
|
int in_len;
|
||||||
|
Tcl_Obj *res;
|
||||||
|
|
||||||
|
if( objc!=2 ){
|
||||||
|
Tcl_AppendResult(interp, "wrong # args: should be \"",
|
||||||
|
Tcl_GetStringFromObj(objv[0], 0), "<utf-16 encoded-string>", 0);
|
||||||
|
return TCL_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
in = Tcl_GetByteArrayFromObj(objv[1], &in_len);
|
||||||
|
out = (unsigned char *)sqliteMalloc(in_len);
|
||||||
|
memcpy(out, in, in_len);
|
||||||
|
|
||||||
|
sqlite3utf16to16be(out, -1);
|
||||||
|
res = Tcl_NewByteArrayObj(out, utf16_length(ret));
|
||||||
|
sqliteFree(out);
|
||||||
|
|
||||||
|
Tcl_SetObjResult(interp, res);
|
||||||
|
|
||||||
|
return TCL_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int sqlite_utf16to8(
|
||||||
|
void * clientData,
|
||||||
|
Tcl_Interp *interp,
|
||||||
|
int objc,
|
||||||
|
Tcl_Obj *CONST objv[]
|
||||||
|
){
|
||||||
|
unsigned char *out;
|
||||||
|
unsigned char *in;
|
||||||
|
Tcl_Obj *res;
|
||||||
|
|
||||||
|
if( objc!=2 ){
|
||||||
|
Tcl_AppendResult(interp, "wrong # args: should be \"",
|
||||||
|
Tcl_GetStringFromObj(objv[0], 0), "<utf-16 encoded-string>", 0);
|
||||||
|
return TCL_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
in = Tcl_GetByteArrayFromObj(objv[1], 0);
|
||||||
|
out = sqlite3utf16to8(in, -1);
|
||||||
|
res = Tcl_NewByteArrayObj(out, strlen(ret));
|
||||||
|
sqliteFree(out);
|
||||||
|
|
||||||
|
Tcl_SetObjResult(interp, res);
|
||||||
|
|
||||||
|
return TCL_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Register commands with the TCL interpreter.
|
||||||
|
*/
|
||||||
|
int Sqlitetest5_Init(Tcl_Interp *interp){
|
||||||
|
static struct {
|
||||||
|
char *zName;
|
||||||
|
Tcl_CmdProc *xProc;
|
||||||
|
} aCmd[] = {
|
||||||
|
{ "sqlite_utf16to8", (Tcl_CmdProc*)sqlite_utf16to8 },
|
||||||
|
{ "sqlite_utf8to16le", (Tcl_CmdProc*)sqlite_utf8to16le },
|
||||||
|
{ "sqlite_utf8to16be", (Tcl_CmdProc*)sqlite_utf8to16be },
|
||||||
|
{ "sqlite_utf16to16le", (Tcl_CmdProc*)sqlite_utf16to16le },
|
||||||
|
{ "sqlite_utf16to16be", (Tcl_CmdProc*)sqlite_utf16to16be }
|
||||||
|
};
|
||||||
|
int i;
|
||||||
|
for(i=0; i<sizeof(aCmd)/sizeof(aCmd[0]); i++){
|
||||||
|
Tcl_CreateCommand(interp, aCmd[i].zName, aCmd[i].xProc, 0, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
return TCL_OK;
|
||||||
|
}
|
444
src/utf.c
444
src/utf.c
@@ -12,7 +12,7 @@
|
|||||||
** This file contains routines used to translate between UTF-8,
|
** This file contains routines used to translate between UTF-8,
|
||||||
** UTF-16, UTF-16BE, and UTF-16LE.
|
** UTF-16, UTF-16BE, and UTF-16LE.
|
||||||
**
|
**
|
||||||
** $Id: utf.c,v 1.1 2004/05/04 15:00:47 drh Exp $
|
** $Id: utf.c,v 1.2 2004/05/06 23:37:53 danielk1977 Exp $
|
||||||
**
|
**
|
||||||
** Notes on UTF-8:
|
** Notes on UTF-8:
|
||||||
**
|
**
|
||||||
@@ -29,15 +29,318 @@
|
|||||||
** 110110wwwwxxxxxx 110111yyyyyyyyyy 000uuuuu xxxxxxyy yyyyyyyy
|
** 110110wwwwxxxxxx 110111yyyyyyyyyy 000uuuuu xxxxxxyy yyyyyyyy
|
||||||
** xxxxxxxxyyyyyyyy 00000000 xxxxxxxx yyyyyyyy
|
** xxxxxxxxyyyyyyyy 00000000 xxxxxxxx yyyyyyyy
|
||||||
**
|
**
|
||||||
|
**
|
||||||
** BOM or Byte Order Mark:
|
** BOM or Byte Order Mark:
|
||||||
** 0xff 0xfe little-endian utf-16 follows
|
** 0xff 0xfe little-endian utf-16 follows
|
||||||
** 0xfe 0xff big-endian utf-16 follows
|
** 0xfe 0xff big-endian utf-16 follows
|
||||||
|
**
|
||||||
|
**
|
||||||
|
** Handling of malformed strings:
|
||||||
|
**
|
||||||
|
** SQLite accepts and processes malformed strings without an error wherever
|
||||||
|
** possible. However this is not possible when converting between UTF-8 and
|
||||||
|
** UTF-16.
|
||||||
|
**
|
||||||
|
** When converting malformed UTF-8 strings to UTF-16, one instance of the
|
||||||
|
** replacement character U+FFFD is emitted for each byte that cannot be interpreted as
|
||||||
|
** part of a valid unicode character.
|
||||||
|
**
|
||||||
|
** When converting malformed UTF-16 strings to UTF-8, one instance of the
|
||||||
|
** replacement character U+FFFD is emitted for each pair of bytes that cannot be
|
||||||
|
** interpreted as part of a valid unicode character.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include "sqliteInt.h"
|
||||||
|
|
||||||
|
/*
** A buffer of encoded text together with a cursor.  The same structure
** is used for strings being read and strings being written.
*/
typedef struct UtfString UtfString;
struct UtfString {
  unsigned char *pZ;    /* Raw string data */
  int n;                /* Allocated length of pZ in bytes */
  int c;                /* Number of pZ bytes already read or written */
};

/* TODO: Implement this macro in os.h. It should be 1 on big-endian
** machines, and 0 on little-endian.
*/
#define SQLITE3_NATIVE_BIGENDIAN 0

/*
** The two byte-order marks, expressed as the value obtained when the
** first two bytes of a string are read with BE16().  BE16() assembles
** the value from individual bytes, so the result does not depend on the
** byte order of the host and these constants must not depend on it
** either:
**
**     0xfe 0xff   ->  0xFEFF   big-endian utf-16 follows
**     0xff 0xfe   ->  0xFFFE   little-endian utf-16 follows
*/
#define BOM_BIGENDIAN       0x0000FEFF
#define BOM_LITTLEENDIAN    0x0000FFFE

/*
** These two macros are used to interpret the first two bytes of the
** unsigned char array pZ as a 16-bit unsigned int. BE16() for a big-endian
** interpretation, LE16() for little-endian.
*/
#define BE16(pZ) (((u16)((pZ)[0])<<8) + (u16)((pZ)[1]))
#define LE16(pZ) (((u16)((pZ)[1])<<8) + (u16)((pZ)[0]))

/*
** READ_16 interprets the first two bytes of the unsigned char array pZ
** as a 16-bit unsigned int. If big_endian is non-zero the interpretation
** is big-endian, otherwise little-endian.
*/
#define READ_16(pZ,big_endian) ((big_endian)?BE16(pZ):LE16(pZ))
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Read the BOM from the start of *pStr, if one is present. Return zero
|
||||||
|
** for little-endian, non-zero for big-endian. If no BOM is present, return
|
||||||
|
** the machines native byte order.
|
||||||
|
**
|
||||||
|
** Return values:
|
||||||
|
** 1 -> big-endian string
|
||||||
|
** 0 -> little-endian string
|
||||||
|
*/
|
||||||
|
static int readUtf16Bom(UtfString *pStr){
|
||||||
|
/* The BOM must be the first thing read from the string */
|
||||||
|
assert( pStr->c==0 );
|
||||||
|
|
||||||
|
/* If the string data consists of 1 byte or less, the BOM will make no
|
||||||
|
** difference anyway. In this case just fall through to the default case
|
||||||
|
** and return the native byte-order for this machine.
|
||||||
|
**
|
||||||
|
** Otherwise, check the first 2 bytes of the string to see if a BOM is
|
||||||
|
** present.
|
||||||
|
*/
|
||||||
|
if( pStr->n>1 ){
|
||||||
|
u32 bom = BE16(pStr->pZ);
|
||||||
|
if( bom==BOM_BIGENDIAN ){
|
||||||
|
pStr->c = 2;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if( bom==BOM_LITTLEENDIAN ){
|
||||||
|
pStr->c = 2;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return SQLITE3_NATIVE_BIGENDIAN;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Read a single unicode character from the UTF-8 encoded string *pStr. The
|
||||||
|
** value returned is a unicode scalar value. In the case of malformed
|
||||||
|
** strings, the unicode replacement character U+FFFD may be returned.
|
||||||
|
*/
|
||||||
|
static u32 readUtf8(UtfString *pStr){
|
||||||
|
struct Utf8TblRow {
|
||||||
|
u8 b1_mask;
|
||||||
|
u8 b1_masked_val;
|
||||||
|
u8 b1_value_mask;
|
||||||
|
int trailing_bytes;
|
||||||
|
};
|
||||||
|
static const struct Utf8TblRow utf8tbl[] = {
|
||||||
|
{ 0x80, 0x00, 0x7F, 0 },
|
||||||
|
{ 0xE0, 0xC0, 0x1F, 1 },
|
||||||
|
{ 0xF0, 0xE0, 0x0F, 2 },
|
||||||
|
{ 0xF8, 0xF0, 0x0E, 3 },
|
||||||
|
{ 0, 0, 0, 0}
|
||||||
|
};
|
||||||
|
|
||||||
|
u8 b1; /* First byte of the potentially multi-byte utf-8 character */
|
||||||
|
u32 ret = 0; /* Return value */
|
||||||
|
int ii;
|
||||||
|
struct Utf8TblRow const *pRow;
|
||||||
|
|
||||||
|
pRow = &(utf8tbl[0]);
|
||||||
|
|
||||||
|
b1 = pStr->pZ[pStr->c];
|
||||||
|
pStr->c++;
|
||||||
|
while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){
|
||||||
|
pRow++;
|
||||||
|
}
|
||||||
|
if( !pRow->b1_mask ){
|
||||||
|
return 0xFFFD;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = (u32)(b1&pRow->b1_value_mask);
|
||||||
|
for( ii=0; ii<pRow->trailing_bytes; ii++ ){
|
||||||
|
u8 b = pStr->pZ[pStr->c+ii];
|
||||||
|
if( (b&0xC0)!=0x80 ){
|
||||||
|
return 0xFFFD;
|
||||||
|
}
|
||||||
|
ret = (ret<<6) + (u32)(b&0x3F);
|
||||||
|
}
|
||||||
|
|
||||||
|
pStr->c += pRow->trailing_bytes;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Write the unicode character 'code' to the string pStr using UTF-8
|
||||||
|
** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails.
|
||||||
|
*/
|
||||||
|
static int writeUtf8(UtfString *pStr, u32 code){
|
||||||
|
struct Utf8WriteTblRow {
|
||||||
|
u32 max_code;
|
||||||
|
int trailing_bytes;
|
||||||
|
u8 b1_and_mask;
|
||||||
|
u8 b1_or_mask;
|
||||||
|
};
|
||||||
|
static const struct Utf8WriteTblRow utf8tbl[] = {
|
||||||
|
{0x0000007F, 0, 0x7F, 0x00},
|
||||||
|
{0x000007FF, 1, 0xDF, 0xC0},
|
||||||
|
{0x0000FFFF, 2, 0xEF, 0xE0},
|
||||||
|
{0x0010FFFF, 3, 0xF7, 0xF0},
|
||||||
|
{0x00000000, 0, 0x00, 0x00}
|
||||||
|
};
|
||||||
|
static const struct Utf8WriteTblRow *pRow = &utf8tbl[0];
|
||||||
|
|
||||||
|
while( code<=pRow->max_code ){
|
||||||
|
assert( pRow->max_code );
|
||||||
|
pRow++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Ensure there is enough room left in the output buffer to write
|
||||||
|
** this UTF-8 character.
|
||||||
|
*/
|
||||||
|
assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) );
|
||||||
|
|
||||||
|
/* Write the UTF-8 encoded character to pStr. All cases below are
|
||||||
|
** intentionally fall-through.
|
||||||
|
*/
|
||||||
|
switch( pRow->trailing_bytes ){
|
||||||
|
case 3:
|
||||||
|
pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)|0x80;
|
||||||
|
code = code>>6;
|
||||||
|
case 2:
|
||||||
|
pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)|0x80;
|
||||||
|
code = code>>6;
|
||||||
|
case 1:
|
||||||
|
pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)|0x80;
|
||||||
|
code = code>>6;
|
||||||
|
case 0:
|
||||||
|
pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))|(pRow->b1_or_mask);
|
||||||
|
}
|
||||||
|
pStr->c += (pRow->trailing_bytes + 1);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Read a single unicode character from the UTF-16 encoded string *pStr. The
|
||||||
|
** value returned is a unicode scalar value. In the case of malformed
|
||||||
|
** strings, the unicode replacement character U+FFFD may be returned.
|
||||||
|
**
|
||||||
|
** If big_endian is true, the string is assumed to be UTF-16BE encoded.
|
||||||
|
** Otherwise, it is UTF-16LE encoded.
|
||||||
|
*/
|
||||||
|
static u32 readUtf16(UtfString *pStr, int big_endian){
|
||||||
|
u32 code_point; /* the first code-point in the character */
|
||||||
|
|
||||||
|
/* If there is only one byte of data left in the string, return the
|
||||||
|
** replacement character.
|
||||||
|
*/
|
||||||
|
if( (pStr->n-pStr->c)==1 ){
|
||||||
|
pStr->c++;
|
||||||
|
return (int)0xFFFD;
|
||||||
|
}
|
||||||
|
|
||||||
|
code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian);
|
||||||
|
pStr->c += 2;
|
||||||
|
|
||||||
|
/* If this is a non-surrogate code-point, just cast it to an int and
|
||||||
|
** return the code-point value.
|
||||||
|
*/
|
||||||
|
if( code_point<0xD800 || code_point>0xE000 ){
|
||||||
|
return code_point;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If this is a trailing surrogate code-point, then the string is
|
||||||
|
** malformed; return the replacement character.
|
||||||
|
*/
|
||||||
|
if( code_point>0xDBFF ){
|
||||||
|
return 0xFFFD;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The code-point just read is a leading surrogate code-point. If their
|
||||||
|
** is not enough data left or the next code-point is not a trailing
|
||||||
|
** surrogate, return the replacement character.
|
||||||
|
*/
|
||||||
|
if( (pStr->n-pStr->c)>1 ){
|
||||||
|
u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian);
|
||||||
|
if( code_point2<0xDC00 || code_point>0xDFFF ){
|
||||||
|
return 0xFFFD;
|
||||||
|
}
|
||||||
|
pStr->c += 2;
|
||||||
|
|
||||||
|
return (
|
||||||
|
(((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */
|
||||||
|
((code_point&0x003F)<<10) + /* xxxxxx */
|
||||||
|
(code_point2&0x03FF) /* yy yyyyyyyy */
|
||||||
|
);
|
||||||
|
|
||||||
|
}else{
|
||||||
|
return (int)0xFFFD;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* not reached */
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
** Write the unicode character 'code' to the string pStr using UTF-16
** encoding, advancing pStr->c past the two or four bytes written.  If
** big_endian is true the output is UTF-16BE, otherwise UTF-16LE.  The
** caller must have left enough room in pStr; this is checked with an
** assert() only.
**
** Values that are negative or larger than U+10FFFF cannot be encoded;
** the replacement character U+FFFD is written in their place.
**
** Always returns 0.
*/
static int writeUtf16(UtfString *pStr, int code, int big_endian){
  int bytes;
  unsigned char *hi_byte;
  unsigned char *lo_byte;

  if( code<0 || code>0x0010FFFF ){
    code = 0xFFFD;
  }

  bytes = (code>0x0000FFFF?4:2);

  /* Ensure there is enough room left in the output buffer to write
  ** this UTF-16 character.
  */
  assert( (pStr->n-pStr->c)>=bytes );

  /* Initialise hi_byte and lo_byte to point at the locations into which
  ** the MSB and LSB of the (first) 16-bit unicode code-point written for
  ** this character.
  */
  hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]);
  lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]);

  if( bytes==2 ){
    *hi_byte = (u8)((code&0x0000FF00)>>8);
    *lo_byte = (u8)(code&0x000000FF);
  }else{
    u32 wrd;

    /* Leading surrogate: 110110wwwwxxxxxx where wwww = uuuuu-1 */
    wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)|0x0000D800;
    *hi_byte = (u8)((wrd&0x0000FF00)>>8);
    *lo_byte = (u8)(wrd&0x000000FF);

    /* Trailing surrogate: 110111yyyyyyyyyy, stored two bytes further on */
    wrd = (code&0x000003FF)|0x0000DC00;
    *(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8);
    *(lo_byte+2) = (u8)(wrd&0x000000FF);
  }

  pStr->c += bytes;

  return 0;
}
|
||||||
|
|
||||||
|
/*
** Return the number of bytes in the UTF-16 string pZ that precede the
** first \u0000 character.  The terminator itself is not counted.  The
** string is examined two bytes at a time starting at pZ[0].
*/
static int utf16Bytelen(const unsigned char *pZ){
  int nByte = 0;          /* Offset of the 16-bit unit being examined */
  for(;;){
    if( pZ[nByte]==0 && pZ[nByte+1]==0 ){
      break;
    }
    nByte += 2;
  }
  return nByte;
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
|
** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
|
||||||
** "BOM") into a UTF-8 string. The UTF-8 string is written into space
|
** "BOM") into a UTF-8 string. The UTF-8 string is written into space
|
||||||
** obtained from sqlit3Malloc() and must be released by the calling function.
|
** obtained from sqlite3Malloc() and must be released by the calling function.
|
||||||
**
|
**
|
||||||
** The parameter N is the number of bytes in the UTF-16 string. If N is
|
** The parameter N is the number of bytes in the UTF-16 string. If N is
|
||||||
** negative, the entire string up to the first \u0000 character is translated.
|
** negative, the entire string up to the first \u0000 character is translated.
|
||||||
@@ -45,7 +348,113 @@
|
|||||||
** The returned UTF-8 string is always \000 terminated.
|
** The returned UTF-8 string is always \000 terminated.
|
||||||
*/
|
*/
|
||||||
unsigned char *sqlite3utf16to8(const void *pData, int N){
|
unsigned char *sqlite3utf16to8(const void *pData, int N){
|
||||||
unsigned char *in = (unsigned char *)pData;
|
UtfString in;
|
||||||
|
UtfString out;
|
||||||
|
int big_endian;
|
||||||
|
|
||||||
|
out.pZ = 0;
|
||||||
|
|
||||||
|
in.pZ = (unsigned char *)pData;
|
||||||
|
in.n = N;
|
||||||
|
in.c = 0;
|
||||||
|
|
||||||
|
if( in.n<0 ){
|
||||||
|
in.n = utf16Bytelen(in.pZ);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* A UTF-8 encoding of a unicode string can require at most 1.5 times as
|
||||||
|
** much space to store as the same string encoded using UTF-16. Allocate
|
||||||
|
** this now.
|
||||||
|
*/
|
||||||
|
out.n = (in.n*1.5) + 1;
|
||||||
|
out.pZ = sqliteMalloc(in.n);
|
||||||
|
if( !out.pZ ){
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
out.c = 0;
|
||||||
|
|
||||||
|
big_endian = readUtf16Bom(&in);
|
||||||
|
while( in.c<in.n ){
|
||||||
|
writeUtf8(&out, readUtf16(&in, big_endian));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Add the NULL-terminator character */
|
||||||
|
assert( out.c<out.n );
|
||||||
|
out.pZ[out.c] = 0x00;
|
||||||
|
|
||||||
|
return out.pZ;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void *utf8toUtf16(const unsigned char *pIn, int N, int big_endian){
|
||||||
|
UtfString in;
|
||||||
|
UtfString out;
|
||||||
|
|
||||||
|
in.pZ = (unsigned char *)pIn;
|
||||||
|
in.n = N;
|
||||||
|
in.c = 0;
|
||||||
|
|
||||||
|
if( in.n<0 ){
|
||||||
|
in.n = strlen(in.pZ);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* A UTF-16 encoding of a unicode string can require at most twice as
|
||||||
|
** much space to store as the same string encoded using UTF-8. Allocate
|
||||||
|
** this now.
|
||||||
|
*/
|
||||||
|
out.n = (in.n*2) + 2;
|
||||||
|
out.pZ = sqliteMalloc(in.n);
|
||||||
|
if( !out.pZ ){
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
out.c = 0;
|
||||||
|
|
||||||
|
while( in.c<in.n ){
|
||||||
|
writeUtf16(&out, readUtf8(&in), big_endian);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Add the NULL-terminator character */
|
||||||
|
assert( (out.c+1)<out.n );
|
||||||
|
out.pZ[out.c] = 0x00;
|
||||||
|
out.pZ[out.c+1] = 0x00;
|
||||||
|
|
||||||
|
return out.pZ;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
** Translate the UTF-8 string pIn, N bytes long, into UTF-16BE.  All of
** the work is done by utf8toUtf16() with its big_endian flag set.
*/
void *sqlite3utf8to16be(const unsigned char *pIn, int N){
  void *pOut = utf8toUtf16(pIn, N, 1);
  return pOut;
}
|
||||||
|
|
||||||
|
/*
** Translate the UTF-8 string pIn, N bytes long, into UTF-16LE.  All of
** the work is done by utf8toUtf16() with its big_endian flag clear.
*/
void *sqlite3utf8to16le(const unsigned char *pIn, int N){
  void *pOut = utf8toUtf16(pIn, N, 0);
  return pOut;
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** This routine does the work for sqlite3utf16to16le() and
|
||||||
|
** sqlite3utf16to16be(). If big_endian is 1 the input string is
|
||||||
|
** transformed in place to UTF-16BE encoding. If big_endian is 0 then
|
||||||
|
** the input is transformed to UTF-16LE.
|
||||||
|
**
|
||||||
|
** Unless the first two bytes of the input string is a BOM, the input is
|
||||||
|
** assumed to be UTF-16 encoded using the machines native byte ordering.
|
||||||
|
*/
|
||||||
|
static void utf16to16(void *pData, int N, int big_endian){
|
||||||
|
UtfString inout;
|
||||||
|
inout.pZ = (unsigned char *)pData;
|
||||||
|
inout.c = 0;
|
||||||
|
inout.n = N;
|
||||||
|
|
||||||
|
if( inout.n<0 ){
|
||||||
|
inout.n = utf16Bytelen(inout.pZ);
|
||||||
|
}
|
||||||
|
|
||||||
|
if( readUtf16Bom(&inout)!=big_endian ){
|
||||||
|
swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c);
|
||||||
|
}else if( inout.c ){
|
||||||
|
memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -57,21 +466,28 @@ unsigned char *sqlite3utf16to8(const void *pData, int N){
|
|||||||
** If the native byte order is little-endian and there is no BOM, then
|
** If the native byte order is little-endian and there is no BOM, then
|
||||||
** this routine is a no-op. If there is a BOM at the start of the string,
|
** this routine is a no-op. If there is a BOM at the start of the string,
|
||||||
** it is removed.
|
** it is removed.
|
||||||
*/
|
**
|
||||||
void sqlite3utf16to16le(void *pData, int N){
|
|
||||||
}
|
|
||||||
void sqlite3utf16to16be(void *pData, int N){
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
** Translation from UTF-16LE to UTF-16BE and back again is accomplished
|
** Translation from UTF-16LE to UTF-16BE and back again is accomplished
|
||||||
** using the library function swab().
|
** using the library function swab().
|
||||||
*/
|
*/
|
||||||
|
void sqlite3utf16to16le(void *pData, int N){
|
||||||
|
utf16to16(pData, N, 0);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
** Translate UTF-8 to UTF-16BE or UTF-16LE
|
** Convert a string in UTF-16 native byte or with a BOM into a UTF-16BE
|
||||||
|
** string. The conversion occurs in-place. The output overwrites the
|
||||||
|
** input. N bytes are converted. If N is negative everything is converted
|
||||||
|
** up to the first \u0000 character.
|
||||||
|
**
|
||||||
|
** If the native byte order is little-endian and there is no BOM, then
|
||||||
|
** this routine is a no-op. If there is a BOM at the start of the string,
|
||||||
|
** it is removed.
|
||||||
|
**
|
||||||
|
** Translation from UTF-16LE to UTF-16BE and back again is accomplished
|
||||||
|
** using the library function swab().
|
||||||
*/
|
*/
|
||||||
void *sqlite3utf8to16be(const unsigned char *pIn, int N){
|
void sqlite3utf16to16be(void *pData, int N){
|
||||||
}
|
utf16to16(pData, N, 1);
|
||||||
void *sqlite3utf8to16le(const unsigned char *pIn, int N){
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user