mirror of
https://github.com/sqlite/sqlite.git
synced 2025-07-29 08:01:23 +03:00
Add an experimental tokenizer to fts4 - "unicode". This tokenizer works in the same way as the "simple" tokenizer, except that it understands unicode "simple case folding" and recognizes all characters not classified as "Letters" or "Numbers" by unicode as token separators.
FossilOrigin-Name: 0c13570ec78c6887103dc99b81b470829fa28385
This commit is contained in:
@ -3554,6 +3554,7 @@ static void hashDestroy(void *p){
|
|||||||
*/
|
*/
|
||||||
void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
|
void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
|
||||||
void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
|
void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
|
||||||
|
void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const**ppModule);
|
||||||
#ifdef SQLITE_ENABLE_ICU
|
#ifdef SQLITE_ENABLE_ICU
|
||||||
void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
|
void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
|
||||||
#endif
|
#endif
|
||||||
@ -3569,6 +3570,7 @@ int sqlite3Fts3Init(sqlite3 *db){
|
|||||||
Fts3Hash *pHash = 0;
|
Fts3Hash *pHash = 0;
|
||||||
const sqlite3_tokenizer_module *pSimple = 0;
|
const sqlite3_tokenizer_module *pSimple = 0;
|
||||||
const sqlite3_tokenizer_module *pPorter = 0;
|
const sqlite3_tokenizer_module *pPorter = 0;
|
||||||
|
const sqlite3_tokenizer_module *pUnicode = 0;
|
||||||
|
|
||||||
#ifdef SQLITE_ENABLE_ICU
|
#ifdef SQLITE_ENABLE_ICU
|
||||||
const sqlite3_tokenizer_module *pIcu = 0;
|
const sqlite3_tokenizer_module *pIcu = 0;
|
||||||
@ -3585,6 +3587,7 @@ int sqlite3Fts3Init(sqlite3 *db){
|
|||||||
|
|
||||||
sqlite3Fts3SimpleTokenizerModule(&pSimple);
|
sqlite3Fts3SimpleTokenizerModule(&pSimple);
|
||||||
sqlite3Fts3PorterTokenizerModule(&pPorter);
|
sqlite3Fts3PorterTokenizerModule(&pPorter);
|
||||||
|
sqlite3Fts3UnicodeTokenizer(&pUnicode);
|
||||||
|
|
||||||
/* Allocate and initialise the hash-table used to store tokenizers. */
|
/* Allocate and initialise the hash-table used to store tokenizers. */
|
||||||
pHash = sqlite3_malloc(sizeof(Fts3Hash));
|
pHash = sqlite3_malloc(sizeof(Fts3Hash));
|
||||||
@ -3598,6 +3601,7 @@ int sqlite3Fts3Init(sqlite3 *db){
|
|||||||
if( rc==SQLITE_OK ){
|
if( rc==SQLITE_OK ){
|
||||||
if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple)
|
if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple)
|
||||||
|| sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter)
|
|| sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter)
|
||||||
|
|| sqlite3Fts3HashInsert(pHash, "unicode", 8, (void *)pUnicode)
|
||||||
#ifdef SQLITE_ENABLE_ICU
|
#ifdef SQLITE_ENABLE_ICU
|
||||||
|| (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu))
|
|| (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu))
|
||||||
#endif
|
#endif
|
||||||
|
@ -541,5 +541,9 @@ int sqlite3Fts3MsrIncrRestart(Fts3MultiSegReader *pCsr);
|
|||||||
|
|
||||||
int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *);
|
int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *);
|
||||||
|
|
||||||
|
/* fts3_unicode2.c (functions generated by parsing unicode text files) */
|
||||||
|
int sqlite3FtsUnicodeTolower(int);
|
||||||
|
int sqlite3FtsUnicodeIsalnum(int);
|
||||||
|
|
||||||
#endif /* !SQLITE_CORE || SQLITE_ENABLE_FTS3 */
|
#endif /* !SQLITE_CORE || SQLITE_ENABLE_FTS3 */
|
||||||
#endif /* _FTSINT_H */
|
#endif /* _FTSINT_H */
|
||||||
|
242
ext/fts3/fts3_unicode.c
Normal file
242
ext/fts3/fts3_unicode.c
Normal file
@ -0,0 +1,242 @@
|
|||||||
|
/*
|
||||||
|
** 2012 May 24
|
||||||
|
**
|
||||||
|
** The author disclaims copyright to this source code. In place of
|
||||||
|
** a legal notice, here is a blessing:
|
||||||
|
**
|
||||||
|
** May you do good and not evil.
|
||||||
|
** May you find forgiveness for yourself and forgive others.
|
||||||
|
** May you share freely, never taking more than you give.
|
||||||
|
**
|
||||||
|
******************************************************************************
|
||||||
|
**
|
||||||
|
** Implementation of the "unicode" full-text-search tokenizer.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "fts3Int.h"
|
||||||
|
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "fts3_tokenizer.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
|
||||||
|
** from the sqlite3 source file utf.c. If this file is compiled as part
|
||||||
|
** of the amalgamation, they are not required.
|
||||||
|
*/
|
||||||
|
#ifndef SQLITE_AMALGAMATION
|
||||||
|
|
||||||
|
/*
** Lookup table used by READ_UTF8. It is indexed by (lead byte - 0xC0) and
** yields the initial value of the codepoint accumulator for that lead byte.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* index 0x00..0x1f: value equals the low five bits of the index */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* index 0x20..0x2f: value equals the low four bits of the index */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* index 0x30..0x37: value equals the low three bits of the index */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* index 0x38..0x3f */
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
|
||||||
|
|
||||||
|
/*
** READ_UTF8(zIn, zTerm, c): decode one character starting at zIn into c
** and advance zIn past the bytes consumed.
**
** A first byte below 0xC0 is returned as-is. Otherwise the accumulator is
** seeded from sqlite3Utf8Trans1[] and each following byte of the form
** 10xxxxxx contributes six more bits; reading stops at zTerm. A decoded
** value below 0x80, in the range 0xD800..0xDFFF, or equal to 0xFFFE or
** 0xFFFF is replaced by 0xFFFD.
**
** The macro expands to two statements (it is not wrapped in do/while) and
** evaluates its arguments more than once, so pass plain lvalues only.
*/
#define READ_UTF8(zIn, zTerm, c)                           \
  c = *(zIn++);                                            \
  if( c>=0xc0 ){                                           \
    c = sqlite3Utf8Trans1[c-0xc0];                         \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
  }
|
||||||
|
|
||||||
|
/*
** WRITE_UTF8(zOut, c): append codepoint c to the buffer at zOut as one to
** four UTF-8 bytes and advance zOut past the bytes written. The caller is
** responsible for ensuring at least four bytes of space are available.
**
** The expression c is evaluated several times (once per comparison and
** once per output byte), so it should be a side-effect-free value such as
** a local variable. The macro relies on a type named u8 being in scope.
*/
#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (u8)(c&0xFF);                            \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (u8)((c>>6)&0x1F);                \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
  }                                                    \
  else if( c<0x10000 ){                                \
    *zOut++ = 0xE0 + (u8)((c>>12)&0x0F);               \
    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
  }else{                                               \
    *zOut++ = 0xF0 + (u8)((c>>18) & 0x07);             \
    *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);             \
    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
  }                                                    \
}
|
||||||
|
|
||||||
|
#endif /* ifndef SQLITE_AMALGAMATION */
|
||||||
|
|
||||||
|
typedef struct unicode_tokenizer unicode_tokenizer;
|
||||||
|
typedef struct unicode_cursor unicode_cursor;
|
||||||
|
|
||||||
|
struct unicode_tokenizer {
|
||||||
|
sqlite3_tokenizer base;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct unicode_cursor {
|
||||||
|
sqlite3_tokenizer_cursor base;
|
||||||
|
const unsigned char *aInput; /* Input text being tokenized */
|
||||||
|
int nInput; /* Size of aInput[] in bytes */
|
||||||
|
int iOff; /* Current offset within aInput[] */
|
||||||
|
int iToken; /* Index of next token to be returned */
|
||||||
|
char *zToken; /* storage for current token */
|
||||||
|
int nAlloc; /* space allocated at zToken */
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Create a new tokenizer instance.
|
||||||
|
*/
|
||||||
|
static int unicodeCreate(
|
||||||
|
int nArg, /* Size of array argv[] */
|
||||||
|
const char * const *azArg, /* Tokenizer creation arguments */
|
||||||
|
sqlite3_tokenizer **pp /* OUT: New tokenizer handle */
|
||||||
|
){
|
||||||
|
unicode_tokenizer *pNew; /* New tokenizer object */
|
||||||
|
pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
|
||||||
|
if( pNew==NULL ){
|
||||||
|
return SQLITE_NOMEM;
|
||||||
|
}
|
||||||
|
memset(pNew, 0, sizeof(unicode_tokenizer));
|
||||||
|
*pp = &pNew->base;
|
||||||
|
return SQLITE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Destroy a tokenizer allocated by unicodeCreate().
|
||||||
|
*/
|
||||||
|
static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
|
||||||
|
sqlite3_free(pTokenizer);
|
||||||
|
return SQLITE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Prepare to begin tokenizing a particular string. The input
|
||||||
|
** string to be tokenized is pInput[0..nBytes-1]. A cursor
|
||||||
|
** used to incrementally tokenize this string is returned in
|
||||||
|
** *ppCursor.
|
||||||
|
*/
|
||||||
|
static int unicodeOpen(
|
||||||
|
sqlite3_tokenizer *p, /* The tokenizer */
|
||||||
|
const char *aInput, /* Input string */
|
||||||
|
int nInput, /* Size of string aInput in bytes */
|
||||||
|
sqlite3_tokenizer_cursor **pp /* OUT: New cursor object */
|
||||||
|
){
|
||||||
|
unicode_cursor *pCsr;
|
||||||
|
|
||||||
|
pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor));
|
||||||
|
if( pCsr==0 ){
|
||||||
|
return SQLITE_NOMEM;
|
||||||
|
}
|
||||||
|
memset(pCsr, 0, sizeof(unicode_cursor));
|
||||||
|
|
||||||
|
pCsr->aInput = (const unsigned char *)aInput;
|
||||||
|
if( aInput==0 ){
|
||||||
|
pCsr->nInput = 0;
|
||||||
|
}else if( nInput<0 ){
|
||||||
|
pCsr->nInput = (int)strlen(aInput);
|
||||||
|
}else{
|
||||||
|
pCsr->nInput = nInput;
|
||||||
|
}
|
||||||
|
|
||||||
|
*pp = &pCsr->base;
|
||||||
|
UNUSED_PARAMETER(p);
|
||||||
|
return SQLITE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Close a tokenization cursor previously opened by a call to
|
||||||
|
** simpleOpen() above.
|
||||||
|
*/
|
||||||
|
static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
|
||||||
|
unicode_cursor *pCsr = (unicode_cursor *) pCursor;
|
||||||
|
sqlite3_free(pCsr->zToken);
|
||||||
|
sqlite3_free(pCsr);
|
||||||
|
return SQLITE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Extract the next token from a tokenization cursor. The cursor must
|
||||||
|
** have been opened by a prior call to simpleOpen().
|
||||||
|
*/
|
||||||
|
static int unicodeNext(
|
||||||
|
sqlite3_tokenizer_cursor *p, /* Cursor returned by simpleOpen */
|
||||||
|
const char **paToken, /* OUT: Token text */
|
||||||
|
int *pnToken, /* OUT: Number of bytes at *paToken */
|
||||||
|
int *piStart, /* OUT: Starting offset of token */
|
||||||
|
int *piEnd, /* OUT: Ending offset of token */
|
||||||
|
int *piPos /* OUT: Position integer of token */
|
||||||
|
){
|
||||||
|
unicode_cursor *pCsr = (unicode_cursor *)p;
|
||||||
|
int iCode;
|
||||||
|
char *zOut;
|
||||||
|
const unsigned char *z = &pCsr->aInput[pCsr->iOff];
|
||||||
|
const unsigned char *zStart = z;
|
||||||
|
const unsigned char *zEnd;
|
||||||
|
const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];
|
||||||
|
|
||||||
|
/* Scan past any delimiter characters before the start of the next token.
|
||||||
|
** Return SQLITE_DONE early if this takes us all the way to the end of
|
||||||
|
** the input. */
|
||||||
|
while( z<zTerm ){
|
||||||
|
READ_UTF8(z, zTerm, iCode);
|
||||||
|
if( sqlite3FtsUnicodeIsalnum(iCode) ) break;
|
||||||
|
zStart = z;
|
||||||
|
}
|
||||||
|
if( zStart>=zTerm ) return SQLITE_DONE;
|
||||||
|
|
||||||
|
zOut = pCsr->zToken;
|
||||||
|
do {
|
||||||
|
/* Grow the output buffer if required. */
|
||||||
|
if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
|
||||||
|
char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
|
||||||
|
if( !zNew ) return SQLITE_NOMEM;
|
||||||
|
zOut = &zNew[zOut - pCsr->zToken];
|
||||||
|
pCsr->zToken = zNew;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Write the folded case of the last character read to the output */
|
||||||
|
zEnd = z;
|
||||||
|
WRITE_UTF8(zOut, sqlite3FtsUnicodeTolower(iCode));
|
||||||
|
|
||||||
|
/* If the cursor is not at EOF, read the next character */
|
||||||
|
if( z>=zTerm ) break;
|
||||||
|
READ_UTF8(z, zTerm, iCode);
|
||||||
|
}while( sqlite3FtsUnicodeIsalnum(iCode) );
|
||||||
|
|
||||||
|
/* Set the output variables and return. */
|
||||||
|
pCsr->iOff = (z - pCsr->aInput);
|
||||||
|
*paToken = pCsr->zToken;
|
||||||
|
*pnToken = zOut - pCsr->zToken;
|
||||||
|
*piStart = (zStart - pCsr->aInput);
|
||||||
|
*piEnd = (zEnd - pCsr->aInput);
|
||||||
|
*piPos = pCsr->iToken++;
|
||||||
|
return SQLITE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Set *ppModule to a pointer to the sqlite3_tokenizer_module
|
||||||
|
** structure for the unicode tokenizer.
|
||||||
|
*/
|
||||||
|
void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const **ppModule){
|
||||||
|
static const sqlite3_tokenizer_module module = {
|
||||||
|
0,
|
||||||
|
unicodeCreate,
|
||||||
|
unicodeDestroy,
|
||||||
|
unicodeOpen,
|
||||||
|
unicodeClose,
|
||||||
|
unicodeNext,
|
||||||
|
0,
|
||||||
|
};
|
||||||
|
*ppModule = &module;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
|
230
ext/fts3/fts3_unicode2.c
Normal file
230
ext/fts3/fts3_unicode2.c
Normal file
@ -0,0 +1,230 @@
|
|||||||
|
/*
|
||||||
|
** DO NOT EDIT THIS MACHINE GENERATED FILE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
/*
** Return 1 if codepoint c is to be treated as part of a token, or 0 if
** it is a separator.
**
** Each aEntry[] value describes one run of separator codepoints: the
** upper 22 bits hold the first codepoint of the run and the lower 10 bits
** hold the number of codepoints in it. The array is sorted, so a binary
** search locates the last run starting at or before c; c is a separator
** only if it lies inside that run. Codepoints of (1<<22) or more are
** always reported as non-separators.
**
** This function is emitted by mkunicode.tcl (print_isalnum); any change
** made here should also be made to the template in that script.
*/
int sqlite3FtsUnicodeIsalnum(int c){
  static const unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
    0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
    0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
    0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
    0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
    0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
    0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
    0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
    0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
    0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
    0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
    0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
    0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
    0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
    0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
    0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
    0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
    0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
    0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
    0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
    0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
    0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
    0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
    0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
    0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
    0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
    0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
    0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
    0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
    0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
    0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
    0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
    0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
    0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
    0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
    0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
    0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
    0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
    0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
    0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
    0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
    0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
    0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
    0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
    0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
    0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
    0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
    0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
    0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
    0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
    0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
    0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
    0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
    0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
    0x037FFC02, 0x03E3FC01, 0x03EC7801, 0x03ECA401, 0x03EEC810,
    0x03F4F802, 0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023,
    0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807,
    0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405,
    0x04040003, 0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E,
    0x040E7C01, 0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01,
    0x04280403, 0x04281402, 0x04283004, 0x0428E003, 0x0428FC01,
    0x04294009, 0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016,
    0x04420003, 0x0442C012, 0x04440003, 0x04449C0E, 0x04450004,
    0x04460003, 0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004,
    0x05BD442E, 0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5,
    0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01,
    0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401,
    0x075EA401, 0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064,
    0x07C2800F, 0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F,
    0x07C4C03C, 0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009,
    0x07C94002, 0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014,
    0x07CE8025, 0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001,
    0x07D108B6, 0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018,
    0x07D7EC46, 0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401,
    0x38008060, 0x380400F0, 0x3C000001, 0x3FFFF401, 0x40000001,
    0x43FFF401,
  };

  if( c<(1<<22) ){
    /* Setting the low 10 bits of the key makes it compare greater than
    ** or equal to any entry whose first codepoint is c, whatever that
    ** entry's length field holds. */
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;                 /* Index of last entry that is <= key */
    int iHi = (int)(sizeof(aEntry)/sizeof(aEntry[0])) - 1;
    int iLo = 0;
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );
    /* c is a token character iff it lies at or beyond the end of the run. */
    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
** Return the case-folded form of codepoint c, or c itself if no folding
** rule applies to it.
*/
int sqlite3FtsUnicodeTolower(int c){
  /* Folding rules, sorted by iCode. A rule covers the nRange codepoints
  ** beginning at iCode.
  **
  ** When bFlag is 0 every codepoint in the range is folded. When bFlag
  ** is 1 only codepoints whose lowest bit matches that of iCode are
  ** folded (that is, every second codepoint starting with iCode). A
  ** codepoint C that is folded maps to ((C + iOff)&0xFFFF).
  **
  ** The contents of this array are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  static const struct TableEntry {
    unsigned short iCode;
    unsigned char bFlag;
    unsigned char nRange;
    unsigned short iOff;
  } aEntry[] = {
    {65, 0, 26, 32},       {181, 0, 1, 775},      {192, 0, 23, 32},
    {216, 0, 7, 32},       {256, 1, 48, 1},       {306, 1, 6, 1},
    {313, 1, 16, 1},       {330, 1, 46, 1},       {376, 0, 1, 65415},
    {377, 1, 6, 1},        {383, 0, 1, 65268},    {385, 0, 1, 210},
    {386, 1, 4, 1},        {390, 0, 1, 206},      {391, 0, 1, 1},
    {393, 0, 2, 205},      {395, 0, 1, 1},        {398, 0, 1, 79},
    {399, 0, 1, 202},      {400, 0, 1, 203},      {401, 0, 1, 1},
    {403, 0, 1, 205},      {404, 0, 1, 207},      {406, 0, 1, 211},
    {407, 0, 1, 209},      {408, 0, 1, 1},        {412, 0, 1, 211},
    {413, 0, 1, 213},      {415, 0, 1, 214},      {416, 1, 6, 1},
    {422, 0, 1, 218},      {423, 0, 1, 1},        {425, 0, 1, 218},
    {428, 0, 1, 1},        {430, 0, 1, 218},      {431, 0, 1, 1},
    {433, 0, 2, 217},      {435, 1, 4, 1},        {439, 0, 1, 219},
    {440, 0, 1, 1},        {444, 0, 1, 1},        {452, 0, 1, 2},
    {453, 0, 1, 1},        {455, 0, 1, 2},        {456, 0, 1, 1},
    {458, 0, 1, 2},        {459, 1, 18, 1},       {478, 1, 18, 1},
    {497, 0, 1, 2},        {498, 1, 4, 1},        {502, 0, 1, 65439},
    {503, 0, 1, 65480},    {504, 1, 40, 1},       {544, 0, 1, 65406},
    {546, 1, 18, 1},       {570, 0, 1, 10795},    {571, 0, 1, 1},
    {573, 0, 1, 65373},    {574, 0, 1, 10792},    {577, 0, 1, 1},
    {579, 0, 1, 65341},    {580, 0, 1, 69},       {581, 0, 1, 71},
    {582, 1, 10, 1},       {837, 0, 1, 116},      {880, 1, 4, 1},
    {886, 0, 1, 1},        {902, 0, 1, 38},       {904, 0, 3, 37},
    {908, 0, 1, 64},       {910, 0, 2, 63},       {913, 0, 17, 32},
    {931, 0, 9, 32},       {962, 0, 1, 1},        {975, 0, 1, 8},
    {976, 0, 1, 65506},    {977, 0, 1, 65511},    {981, 0, 1, 65521},
    {982, 0, 1, 65514},    {984, 1, 24, 1},       {1008, 0, 1, 65482},
    {1009, 0, 1, 65488},   {1012, 0, 1, 65476},   {1013, 0, 1, 65472},
    {1015, 0, 1, 1},       {1017, 0, 1, 65529},   {1018, 0, 1, 1},
    {1021, 0, 3, 65406},   {1024, 0, 16, 80},     {1040, 0, 32, 32},
    {1120, 1, 34, 1},      {1162, 1, 54, 1},      {1216, 0, 1, 15},
    {1217, 1, 14, 1},      {1232, 1, 88, 1},      {1329, 0, 38, 48},
    {4256, 0, 38, 7264},   {4295, 0, 1, 7264},    {4301, 0, 1, 7264},
    {7680, 1, 150, 1},     {7835, 0, 1, 65478},   {7838, 0, 1, 57921},
    {7840, 1, 96, 1},      {7944, 0, 8, 65528},   {7960, 0, 6, 65528},
    {7976, 0, 8, 65528},   {7992, 0, 8, 65528},   {8008, 0, 6, 65528},
    {8025, 1, 8, 65528},   {8040, 0, 8, 65528},   {8072, 0, 8, 65528},
    {8088, 0, 8, 65528},   {8104, 0, 8, 65528},   {8120, 0, 2, 65528},
    {8122, 0, 2, 65462},   {8124, 0, 1, 65527},   {8126, 0, 1, 58363},
    {8136, 0, 4, 65450},   {8140, 0, 1, 65527},   {8152, 0, 2, 65528},
    {8154, 0, 2, 65436},   {8168, 0, 2, 65528},   {8170, 0, 2, 65424},
    {8172, 0, 1, 65529},   {8184, 0, 2, 65408},   {8186, 0, 2, 65410},
    {8188, 0, 1, 65527},   {8486, 0, 1, 58019},   {8490, 0, 1, 57153},
    {8491, 0, 1, 57274},   {8498, 0, 1, 28},      {8544, 0, 16, 16},
    {8579, 0, 1, 1},       {9398, 0, 26, 26},     {11264, 0, 47, 48},
    {11360, 0, 1, 1},      {11362, 0, 1, 54793},  {11363, 0, 1, 61722},
    {11364, 0, 1, 54809},  {11367, 1, 6, 1},      {11373, 0, 1, 54756},
    {11374, 0, 1, 54787},  {11375, 0, 1, 54753},  {11376, 0, 1, 54754},
    {11378, 0, 1, 1},      {11381, 0, 1, 1},      {11390, 0, 2, 54721},
    {11392, 1, 100, 1},    {11499, 1, 4, 1},      {11506, 0, 1, 1},
    {42560, 1, 46, 1},     {42624, 1, 24, 1},     {42786, 1, 14, 1},
    {42802, 1, 62, 1},     {42873, 1, 4, 1},      {42877, 0, 1, 30204},
    {42878, 1, 10, 1},     {42891, 0, 1, 1},      {42893, 0, 1, 23256},
    {42896, 1, 4, 1},      {42912, 1, 10, 1},     {42922, 0, 1, 23228},
    {65313, 0, 26, 32},
  };

  int ret = c;

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c>=65536 ){
    /* The only rule handled outside the 16-bit table: a block of 40
    ** codepoints that each fold to the codepoint 40 places higher. */
    if( c>=66560 && c<66600 ){
      ret = c + 40;
    }
  }else{
    int iLeft = 0;
    int iRight = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iBest = -1;               /* Last rule with iCode<=c, or -1 */

    /* Binary search for the last rule that starts at or before c. */
    while( iLeft<=iRight ){
      int iMid = (iRight + iLeft) / 2;
      if( (c - aEntry[iMid].iCode)<0 ){
        iRight = iMid-1;
      }else{
        iBest = iMid;
        iLeft = iMid+1;
      }
    }
    assert( iBest<0 || c>=aEntry[iBest].iCode );

    if( iBest>=0 ){
      const struct TableEntry *pRule = &aEntry[iBest];
      int bInRange = ( c<(pRule->iCode + pRule->nRange) );
      int bSelected = ( 0==(pRule->bFlag & (pRule->iCode ^ c)) );
      if( bInRange && bSelected ){
        ret = (c + pRule->iOff) & 0x0000FFFF;
        assert( ret>0 );
      }
    }
  }

  return ret;
}
|
1224
ext/fts3/unicode/CaseFolding.txt
Normal file
1224
ext/fts3/unicode/CaseFolding.txt
Normal file
File diff suppressed because it is too large
Load Diff
24428
ext/fts3/unicode/UnicodeData.txt
Normal file
24428
ext/fts3/unicode/UnicodeData.txt
Normal file
File diff suppressed because it is too large
Load Diff
466
ext/fts3/unicode/mkunicode.tcl
Normal file
466
ext/fts3/unicode/mkunicode.tcl
Normal file
@ -0,0 +1,466 @@
|
|||||||
|
|
||||||
|
|
||||||
|
# $zName is the path to a copy of UnicodeData.txt. Parse the file and
# return a list of integer codepoints, one for each record in the file
# whose "General Category" does not start with "L" (Letter) or "N"
# (Number). Blank lines are skipped; a record with the wrong number of
# fields raises a "parse error".
#
proc an_load_unicodedata_text {zName} {
  # Names of the semicolon-separated fields of a record, in file order.
  # Each becomes a local variable while a record is being processed.
  set lField {
    code
    character_name
    general_category
    canonical_combining_classes
    bidirectional_category
    character_decomposition_mapping
    decimal_digit_value
    digit_value
    numeric_value
    mirrored
    unicode_1_name
    iso10646_comment_field
    uppercase_mapping
    lowercase_mapping
    titlecase_mapping
  }
  set nField [llength $lField]

  set lSeparator [list]
  set fd [open $zName]
  while { ![eof $fd] } {
    set line [gets $fd]
    if {$line == ""} continue

    set fields [split $line ";"]
    if {[llength $fields] != $nField} { error "parse error: $line" }
    foreach $lField $fields {}

    # The major class is the first letter of the general category.
    set zMajor [string index $general_category 0]
    if {[lsearch {L N} $zMajor] < 0} {
      lappend lSeparator [expr "0x$code"]
    }
  }
  close $fd

  return $lSeparator
}
|
||||||
|
|
||||||
|
# Load the separator codepoints from the file named by the global variable
# "unicodedata.txt" and collapse them into runs of consecutive values.
# Returns a list of two-element lists: {first-codepoint run-length}.
#
proc an_load_separator_ranges {} {
  global unicodedata.txt
  set lSep [an_load_unicodedata_text ${unicodedata.txt}]

  set lRange [list]
  foreach sep $lSep {
    # Extend the current run if $sep immediately follows it.
    if {[info exists iFirst] && $sep == ($iFirst+$nRange)} {
      incr nRange
      continue
    }

    # Otherwise flush the current run (if any) and start a new one.
    if {[info exists iFirst]} {
      lappend lRange [list $iFirst $nRange]
    }
    set iFirst $sep
    set nRange 1
  }
  lappend lRange [list $iFirst $nRange]

  return $lRange
}
|
||||||
|
|
||||||
|
# Print to stdout the C declaration of the array aEntry[], with one
# 32-bit value per element of $lRange. Each element of $lRange is a list
# {iFirst nRange}; it is encoded as (iFirst<<10) + nRange. An error is
# raised if any iFirst needs more than 22 bits or any nRange more than 10.
#
# The declaration is emitted as "static const" (storage class first)
# rather than "const static", which C marks as an obsolescent ordering.
#
proc an_print_range_array {lRange} {
  set iFirstMax 0
  set nRangeMax 0
  foreach range $lRange {
    foreach {iFirst nRange} $range {}
    if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
    if {$nRange > $nRangeMax} {set nRangeMax $nRange}
  }
  if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
  if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}

  puts -nonewline " static const unsigned int aEntry\[\] = \{"
  set i 0
  foreach range $lRange {
    foreach {iFirst nRange} $range {}
    set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]]

    # Five values per output line.
    if {($i % 5)==0} {puts "" ; puts -nonewline " "}
    puts -nonewline " $u32,"
    incr i
  }
  puts ""
  puts " \};"
}
|
||||||
|
|
||||||
|
# Print to stdout the C function "int $zFunc(int c)". The function body
# consists of the aEntry[] table built from $lRange (printed by
# an_print_range_array) followed by a binary search over that table.
#
# In the emitted code iRes is initialised to 0, so it is never read
# uninitialised even when assert() is compiled out, and c is cast to
# unsigned int explicitly for the final comparison instead of relying on
# the implicit conversion.
#
proc print_isalnum {zFunc lRange} {
  puts "int ${zFunc}\(int c)\{"
  an_print_range_array $lRange
  puts {
  if( c<(1<<22) ){
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );
    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;}
  puts "\}"
}
|
||||||
|
|
||||||
|
# Print to stdout a C function "static int isalnum_test(int *piCode)" that
# checks $zFunc against the separator ranges in $lRange. The generated
# function returns 0 if every check passes; otherwise it stores the
# offending codepoint in *piCode and returns 1.
#
# Three tables are generated:
#
#   aAlnum[]       expected result for every codepoint below 70000
#   aLargeSep[]    every separator codepoint >= 70000
#   aLargeOther[]  non-separator neighbours (+/-1) of those separators
#
proc print_test_isalnum {zFunc lRange} {
  # After this loop, a($i) exists for each separator codepoint $i.
  foreach range $lRange {
    foreach {iFirst nRange} $range {}
    for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 }
  }
  set lSorted [lsort -integer [array names a]]

  puts "static int isalnum_test(int *piCode)\{"

  puts -nonewline " unsigned char aAlnum\[\] = \{"
  for {set i 0} {$i < 70000} {incr i} {
    if {($i % 32)==0} { puts "" ; puts -nonewline " " }
    puts -nonewline "[expr {![info exists a($i)]}],"
  }
  puts ""
  puts " \};"

  puts -nonewline " int aLargeSep\[\] = \{"
  set nOut 0
  foreach iSep $lSorted {
    if {$iSep<70000} continue
    if {($nOut % 8)==0} { puts "" ; puts -nonewline " " }
    puts -nonewline " $iSep,"
    incr nOut
  }
  puts ""
  puts " \};"

  puts -nonewline " int aLargeOther\[\] = \{"
  set nOut 0
  foreach iSep $lSorted {
    if {$iSep<70000} continue
    # Check the codepoint just below, then the one just above.
    foreach iNear [list [expr {$iSep-1}] [expr {$iSep+1}]] {
      if {[info exists a($iNear)]} continue
      if {($nOut % 8)==0} { puts "" ; puts -nonewline " " }
      puts -nonewline " $iNear,"
      incr nOut
    }
  }
  puts ""
  puts " \};"

  puts [subst -nocommands {
  int i;
  for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){
    if( ${zFunc}(i)!=aAlnum[i] ){
      *piCode = i;
      return 1;
    }
  }
  for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){
    if( ${zFunc}(aLargeSep[i])!=0 ){
      *piCode = aLargeSep[i];
      return 1;
    }
  }
  for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){
    if( ${zFunc}(aLargeOther[i])!=1 ){
      *piCode = aLargeOther[i];
      return 1;
    }
  }
  }]
  puts " return 0;"
  puts "\}"
}
|
||||||
|
|
||||||
|
#-------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Load case-folding rules from a CaseFolding.txt file into the global
# array tl_lookup_table.
#
#   zName  Path of the CaseFolding.txt file to read.
#
# Lines beginning with "#" and empty lines are skipped. Every other line is
# split on ";" into <code>; <status>; <mapping>; <comment>. Only rules whose
# status is "C" or "S" are kept, stored as
#   tl_lookup_table(<code>) = <mapping>
# with both values converted from hexadecimal text to decimal integers.
proc tl_load_casefolding_txt {zName} {
  global tl_lookup_table

  set fd [open $zName]
  while { ![eof $fd] } {
    set line [gets $fd]
    if {[string range $line 0 0] == "#"} continue
    if {$line == ""} continue

    # Clear the previous line's fields so a short line cannot inherit them.
    foreach x {a b c d} {unset -nocomplain $x}
    foreach {a b c d} [split $line ";"] {}

    set a2 [list]
    set c2 [list]
    foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
    foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
    set b [string trim $b]
    set d [string trim $d]

    if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
  }

  # Release the channel now that the whole file has been consumed.
  close $fd
}
|
||||||
|
|
||||||
|
# Compress the global tl_lookup_table array into a list of range records.
#
# Returns a list of {iFirst nIncr nRange nOff} records, where:
#   iFirst  first codepoint of the run
#   nIncr   distance between consecutive codepoints of the run (1 or 2)
#   nRange  number of codepoints in the run
#   nOff    (mapping - codepoint), identical for every member of the run
#
# Codepoints are visited in ascending order. A codepoint extends the
# current run when it lies exactly nIncr past the run's last member and has
# the same offset; otherwise the run is flushed and a new one begins.
proc tl_create_records {} {
  global tl_lookup_table

  set iFirst ""
  set nOff 0
  set nRange 0
  set nIncr 0
  set lRecord [list]

  foreach code [lsort -integer [array names tl_lookup_table]] {
    set delta [expr {$tl_lookup_table($code) - $code}]

    if {$iFirst ne ""} {
      # Distance from the last codepoint already in the run.
      set step [expr {$code - ($iFirst + ($nIncr * ($nRange - 1)))}]

      # A one-element run may still choose a stride of 1 or 2.
      if {$nRange==1 && ($step==1 || $step==2)} { set nIncr $step }

      if {$step == $nIncr && $delta == $nOff} {
        incr nRange
        continue
      }

      # Run is broken: normalise a singleton's stride, then flush it.
      if {$nRange==1} { set nIncr 1 }
      lappend lRecord [list $iFirst $nIncr $nRange $nOff]
    }

    # Begin a new run at this codepoint.
    set iFirst $code
    set nOff $delta
    set nRange 1
    set nIncr 1
  }

  # Flush the final (possibly still empty) run.
  lappend lRecord [list $iFirst $nIncr $nRange $nOff]
  return $lRecord
}
|
||||||
|
|
||||||
|
# Write to stdout the explanatory C comment and the opening of the
# "static const struct TableEntry { ... } aEntry[] = {" declaration that
# precedes the case-folding table entries.
proc tl_print_table_header {} {
  # [string trim] strips the comment's leading whitespace, so the indent
  # for its first line is printed separately.
  puts -nonewline " "
  puts [string trim {
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If bFlag is clear, then all the codepoints in the range are upper
  ** case and require folding. Or, if bFlag is set, then only every second
  ** codepoint in the range, starting with iCode, requires folding. If a
  ** specific codepoint C does require folding, then the lower-case version
  ** is ((C + iOff)&0xFFFF).
  **
  ** The contents of this array are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  }]

  # Struct declaration, one output line per list element.
  foreach zLine [list \
    " static const struct TableEntry \{" \
    " unsigned short iCode;" \
    " unsigned char bFlag;" \
    " unsigned char nRange;" \
    " unsigned short iOff;" \
    " \} aEntry\[\] = \{" \
  ] {
    puts $zLine
  }
}
|
||||||
|
|
||||||
|
# Print one initializer of the aEntry[] case-folding table to stdout.
#
#   togglevar  Name of a variable in the caller's scope holding the current
#              output column (0, 1 or 2). It is created on first use and
#              advanced after each printed entry; three entries are printed
#              per output line.
#   entry      A {iFirst nIncr nRange nOff} record.
#
# Returns 1, printing nothing and leaving the column variable untouched, if
# the record cannot be stored in the table. Returns 0 after printing the
# entry.
proc tl_print_table_entry {togglevar entry} {
  upvar $togglevar t
  foreach {iFirst nIncr nRange nOff} $entry {}

  # The emitted iCode field is an "unsigned short" and the emitted lookup
  # only consults the table for c<65536, so 65536 itself must be rejected
  # as well (hence >= rather than >).
  if {$iFirst >= (1<<16)} { return 1 }

  if {[info exists t]==0} {set t 0}
  if {$t==0} { puts -nonewline " " }

  # A stride of 2 is encoded as bFlag=1 with the range doubled, so that
  # the range covers the skipped codepoints too.
  set flags 0
  if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]}

  # Negative offsets are stored modulo 2^16 (iOff is an unsigned short).
  if {$nOff<0} { incr nOff [expr (1<<16)] }

  set txt "{$iFirst, $flags, $nRange, $nOff},"
  if {$t==2} {
    puts $txt
  } else {
    puts -nonewline [format "% -23s" $txt]
  }
  set t [expr ($t+1)%3]

  return 0
}
|
||||||
|
|
||||||
|
# Close the aEntry[] initializer on stdout.
#
#   togglevar  Name of the caller's output-column variable, as maintained
#              by tl_print_table_entry.
#
# If the last table line is only partly filled (column is not 0), it is
# terminated with a newline before the closing "};" is printed.
proc tl_print_table_footer {togglevar} {
  upvar $togglevar nCol
  if {$nCol!=0} {
    puts ""
  }
  puts " \};"
}
|
||||||
|
|
||||||
|
# Print a C "else if" clause to stdout that applies one folding record
# directly, without going through the lookup table.
#
#   entry  A {iFirst nIncr nRange nOff} record.
#
# Raises an error for records with a stride of 2, which this generator
# does not support.
proc tl_print_if_entry {entry} {
  foreach {iFirst nIncr nRange nOff} $entry {}
  if {$nIncr==2} {error "tl_print_if_entry needs improvement!"}

  # Exclusive upper bound of the codepoint range.
  set iEnd [expr {$iFirst+$nRange}]

  puts [join [list \
    " else if( c>=$iFirst && c<$iEnd )\{" \
    " ret = c + $nOff;" \
    " \}" \
  ] "\n"]
}
|
||||||
|
|
||||||
|
# Write to stdout the C source of the case-folding function.
#
#   zFunc  Name to give the generated C function ("int zFunc(int c)").
#
# Records produced by tl_create_records that tl_print_table_entry accepts
# become rows of the aEntry[] table, which the generated code searches with
# a binary search for c<65536. Records it rejects are emitted afterwards as
# a chain of "else if" range tests.
proc print_tolower {zFunc} {
  set lRecord [tl_create_records]
  set lHigh [list]

  puts "int ${zFunc}\(int c)\{"
  tl_print_table_header

  # tl_print_table_entry returns non-zero for records it did not print.
  foreach rec $lRecord {
    set bRejected [tl_print_table_entry toggle $rec]
    if {$bRejected} { lappend lHigh $rec }
  }
  tl_print_table_footer toggle

  # Lookup code: find the last table entry with iCode<=c, then check that
  # c lies inside that entry's range (and on its stride when bFlag is set).
  puts {
  int ret = c;

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<65536 ){
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
      if( cmp>=0 ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( iRes<0 || c>=aEntry[iRes].iCode );

    if( iRes>=0 ){
      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(p->bFlag & (p->iCode ^ c)) ){
        ret = (c + p->iOff) & 0x0000FFFF;
        assert( ret>0 );
      }
    }
  }
  }

  foreach rec $lHigh {
    tl_print_if_entry $rec
  }

  puts ""
  puts " return ret;"
  puts "\}"
}
|
||||||
|
|
||||||
|
# Write to stdout the C source of tolower_test(), a self-check for the
# generated case-folding function named $zFunc.
#
#   zFunc  Name of the C function the generated test calls.
#
# The emitted aLookup[] array holds, for every codepoint below 70000, the
# value stored in the global tl_lookup_table, or the codepoint itself when
# no entry exists. The generated function stores the first mismatching
# codepoint in *piCode and returns 1, or returns 0 if all values agree.
proc print_tolower_test {zFunc} {
  global tl_lookup_table

  puts "static int tolower_test(int *piCode)\{"
  puts -nonewline " static int aLookup\[\] = \{"
  for {set cp 0} {$cp < 70000} {incr cp} {
    if {[info exists tl_lookup_table($cp)]} {
      set expected $tl_lookup_table($cp)
    } else {
      set expected $cp
    }
    # Eight values per output line.
    if {($cp % 8)==0} { puts "" ; puts -nonewline " " }
    puts -nonewline "$expected, "
  }
  puts " \};"

  puts [join [list \
    " int i;" \
    " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{" \
    " if( ${zFunc}\(i)!=aLookup\[i\] )\{" \
    " *piCode = i;" \
    " return 1;" \
    " \}" \
    " \}" \
    " return 0;" \
    "\}" \
  ] "\n"]
}
|
||||||
|
|
||||||
|
|
||||||
|
# Write the banner of the generated C file to stdout: a "do not edit"
# comment, a blank line, the <assert.h> include and another blank line.
proc print_fileheader {} {
  puts "/*\n** DO NOT EDIT THIS MACHINE GENERATED FILE.\n*/"
  puts "\n#include <assert.h>\n"
}
|
||||||
|
|
||||||
|
# Write to stdout a C main() that runs isalnum_test() and tolower_test(),
# reports the outcome of each with printf(), and returns non-zero if either
# test failed.
proc print_test_main {} {
  puts [join [list \
    "" \
    "#include <stdio.h>" \
    "" \
    "int main(int argc, char **argv)\{" \
    " int r1, r2;" \
    " int code;" \
    " r1 = isalnum_test(&code);" \
    " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);" \
    " else printf(\"isalnum(): test passed\\n\");" \
    " r2 = tolower_test(&code);" \
    " if( r2 ) printf(\"tolower(): Problem with code %d\\n\",code);" \
    " else printf(\"tolower(): test passed\\n\");" \
    " return (r1 || r2);" \
    "\}" \
  ] "\n"]
}
|
||||||
|
|
||||||
|
# Process the command line arguments. Exit early if they are not to
|
||||||
|
# our liking.
|
||||||
|
#
|
||||||
|
# Print the command-line synopsis to stderr and terminate the script with
# exit status 1.
proc usage {} {
  # The synopsis is printed in two pieces on one line; the first piece ends
  # with a space so the option and the file arguments stay separated.
  puts -nonewline stderr "Usage: $::argv0 ?-test? "
  puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>"
  exit 1
}
|
||||||
|
# Accept either two file arguments, or "-test" followed by two file
# arguments. Anything else prints the usage message and exits.
switch -- [llength $argv] {
  2 { }
  3 { if {[lindex $argv 0] ne "-test"} usage }
  default { usage }
}
set unicodedata.txt [lindex $argv end]
set casefolding.txt [lindex $argv end-1]
set generate_test_code [expr {[llength $argv]==3}]

# Emit the file banner followed by the isalnum() function.
#
print_fileheader
set lRange [an_load_separator_ranges]
print_isalnum sqlite3FtsUnicodeIsalnum $lRange

# Two blank lines separate the two generated C functions.
#
puts "\n"

# Emit the tolower() function.
#
tl_load_casefolding_txt ${casefolding.txt}
print_tolower sqlite3FtsUnicodeTolower

# With -test, also emit the self-test routines and a main() that runs
# them.
#
if {$::generate_test_code} {
  print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
  print_tolower_test sqlite3FtsUnicodeTolower
  print_test_main
}
|
||||||
|
|
||||||
|
|
||||||
|
|
9
main.mk
9
main.mk
@ -55,6 +55,7 @@ LIBOBJ+= alter.o analyze.o attach.o auth.o \
|
|||||||
callback.o complete.o ctime.o date.o delete.o expr.o fault.o fkey.o \
|
callback.o complete.o ctime.o date.o delete.o expr.o fault.o fkey.o \
|
||||||
fts3.o fts3_aux.o fts3_expr.o fts3_hash.o fts3_icu.o fts3_porter.o \
|
fts3.o fts3_aux.o fts3_expr.o fts3_hash.o fts3_icu.o fts3_porter.o \
|
||||||
fts3_snippet.o fts3_tokenizer.o fts3_tokenizer1.o \
|
fts3_snippet.o fts3_tokenizer.o fts3_tokenizer1.o \
|
||||||
|
fts3_unicode.o fts3_unicode2.o \
|
||||||
fts3_write.o func.o global.o hash.o \
|
fts3_write.o func.o global.o hash.o \
|
||||||
icu.o insert.o journal.o legacy.o loadext.o \
|
icu.o insert.o journal.o legacy.o loadext.o \
|
||||||
main.o malloc.o mem0.o mem1.o mem2.o mem3.o mem5.o \
|
main.o malloc.o mem0.o mem1.o mem2.o mem3.o mem5.o \
|
||||||
@ -198,6 +199,8 @@ SRC += \
|
|||||||
$(TOP)/ext/fts3/fts3_tokenizer.h \
|
$(TOP)/ext/fts3/fts3_tokenizer.h \
|
||||||
$(TOP)/ext/fts3/fts3_tokenizer.c \
|
$(TOP)/ext/fts3/fts3_tokenizer.c \
|
||||||
$(TOP)/ext/fts3/fts3_tokenizer1.c \
|
$(TOP)/ext/fts3/fts3_tokenizer1.c \
|
||||||
|
$(TOP)/ext/fts3/fts3_unicode.c \
|
||||||
|
$(TOP)/ext/fts3/fts3_unicode2.c \
|
||||||
$(TOP)/ext/fts3/fts3_write.c
|
$(TOP)/ext/fts3/fts3_write.c
|
||||||
SRC += \
|
SRC += \
|
||||||
$(TOP)/ext/icu/sqliteicu.h \
|
$(TOP)/ext/icu/sqliteicu.h \
|
||||||
@ -508,6 +511,12 @@ fts3_tokenizer.o: $(TOP)/ext/fts3/fts3_tokenizer.c $(HDR) $(EXTHDR)
|
|||||||
fts3_tokenizer1.o: $(TOP)/ext/fts3/fts3_tokenizer1.c $(HDR) $(EXTHDR)
|
fts3_tokenizer1.o: $(TOP)/ext/fts3/fts3_tokenizer1.c $(HDR) $(EXTHDR)
|
||||||
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_tokenizer1.c
|
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_tokenizer1.c
|
||||||
|
|
||||||
|
fts3_unicode.o: $(TOP)/ext/fts3/fts3_unicode.c $(HDR) $(EXTHDR)
|
||||||
|
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_unicode.c
|
||||||
|
|
||||||
|
fts3_unicode2.o: $(TOP)/ext/fts3/fts3_unicode2.c $(HDR) $(EXTHDR)
|
||||||
|
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_unicode2.c
|
||||||
|
|
||||||
fts3_write.o: $(TOP)/ext/fts3/fts3_write.c $(HDR) $(EXTHDR)
|
fts3_write.o: $(TOP)/ext/fts3/fts3_write.c $(HDR) $(EXTHDR)
|
||||||
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_write.c
|
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_write.c
|
||||||
|
|
||||||
|
34
manifest
34
manifest
@ -1,5 +1,5 @@
|
|||||||
C Version\s3.7.12.1
|
C Add\san\sexperimental\stokenizer\sto\sfts4\s-\s"unicode".\sThis\stokenizer\sworks\sin\sthe\ssame\sway\sexcept\sthat\sit\sunderstands\sunicode\s"simple\scase\sfolding"\sand\srecognizes\sall\scharacters\snot\sclassified\sas\s"Letters"\sor\s"Numbers"\sby\sunicode\sas\stoken\sseparators.
|
||||||
D 2012-05-22T02:45:53.459
|
D 2012-05-25T17:50:19.893
|
||||||
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
|
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
|
||||||
F Makefile.in 2f37e468503dbe79d35c9f6dffcf3fae1ae9ec20
|
F Makefile.in 2f37e468503dbe79d35c9f6dffcf3fae1ae9ec20
|
||||||
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
|
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
|
||||||
@ -55,9 +55,9 @@ F ext/fts3/README.content fdc666a70d5257a64fee209f97cf89e0e6e32b51
|
|||||||
F ext/fts3/README.syntax a19711dc5458c20734b8e485e75fb1981ec2427a
|
F ext/fts3/README.syntax a19711dc5458c20734b8e485e75fb1981ec2427a
|
||||||
F ext/fts3/README.tokenizers 998756696647400de63d5ba60e9655036cb966e9
|
F ext/fts3/README.tokenizers 998756696647400de63d5ba60e9655036cb966e9
|
||||||
F ext/fts3/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
|
F ext/fts3/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
|
||||||
F ext/fts3/fts3.c a7adf6747d1fdd627ecd421c1709996741ca6693
|
F ext/fts3/fts3.c 81c77264290b88ed80b7ad23ced152193eefae8f
|
||||||
F ext/fts3/fts3.h 3a10a0af180d502cecc50df77b1b22df142817fe
|
F ext/fts3/fts3.h 3a10a0af180d502cecc50df77b1b22df142817fe
|
||||||
F ext/fts3/fts3Int.h aca752b99c15ee738f5bcf0910eafb9e4aeb1b97
|
F ext/fts3/fts3Int.h 7b163fa22e7a625c404c424f2779a4d7b14c14ad
|
||||||
F ext/fts3/fts3_aux.c 5205182bd8f372782597888156404766edf5781e
|
F ext/fts3/fts3_aux.c 5205182bd8f372782597888156404766edf5781e
|
||||||
F ext/fts3/fts3_expr.c dbc7ba4c3a6061adde0f38ed8e9b349568299551
|
F ext/fts3/fts3_expr.c dbc7ba4c3a6061adde0f38ed8e9b349568299551
|
||||||
F ext/fts3/fts3_hash.c 8dd2d06b66c72c628c2732555a32bc0943114914
|
F ext/fts3/fts3_hash.c 8dd2d06b66c72c628c2732555a32bc0943114914
|
||||||
@ -70,10 +70,15 @@ F ext/fts3/fts3_test.c 348f7d08cae05285794e23dc4fe8b8fdf66e264a
|
|||||||
F ext/fts3/fts3_tokenizer.c 3da7254a9881f7e270ab28e2004e0d22b3212bce
|
F ext/fts3/fts3_tokenizer.c 3da7254a9881f7e270ab28e2004e0d22b3212bce
|
||||||
F ext/fts3/fts3_tokenizer.h 66dec98e365854b6cd2d54f1a96bb6d428fc5a68
|
F ext/fts3/fts3_tokenizer.h 66dec98e365854b6cd2d54f1a96bb6d428fc5a68
|
||||||
F ext/fts3/fts3_tokenizer1.c 5c98225a53705e5ee34824087478cf477bdb7004
|
F ext/fts3/fts3_tokenizer1.c 5c98225a53705e5ee34824087478cf477bdb7004
|
||||||
|
F ext/fts3/fts3_unicode.c 033ee5d10d1a69613890d892829e6d3cf7177e40
|
||||||
|
F ext/fts3/fts3_unicode2.c 83ad4e6a2e5ef96d89d0822810be74748a91b94f
|
||||||
F ext/fts3/fts3_write.c cd4af00b3b0512b4d76177a267fcaafab44cbce4
|
F ext/fts3/fts3_write.c cd4af00b3b0512b4d76177a267fcaafab44cbce4
|
||||||
F ext/fts3/fts3speed.tcl b54caf6a18d38174f1a6e84219950d85e98bb1e9
|
F ext/fts3/fts3speed.tcl b54caf6a18d38174f1a6e84219950d85e98bb1e9
|
||||||
F ext/fts3/mkfts3amal.tcl 252ecb7fe6467854f2aa237bf2c390b74e71f100
|
F ext/fts3/mkfts3amal.tcl 252ecb7fe6467854f2aa237bf2c390b74e71f100
|
||||||
F ext/fts3/tool/fts3view.c 6cfc5b67a5f0e09c0d698f9fd012c784bfaa9197
|
F ext/fts3/tool/fts3view.c 6cfc5b67a5f0e09c0d698f9fd012c784bfaa9197
|
||||||
|
F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c
|
||||||
|
F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
|
||||||
|
F ext/fts3/unicode/mkunicode.tcl 1f50ed0021cb7415b3d24505512996037b2e5ec4
|
||||||
F ext/icu/README.txt bf8461d8cdc6b8f514c080e4e10dc3b2bbdfefa9
|
F ext/icu/README.txt bf8461d8cdc6b8f514c080e4e10dc3b2bbdfefa9
|
||||||
F ext/icu/icu.c eb9ae1d79046bd7871aa97ee6da51eb770134b5a
|
F ext/icu/icu.c eb9ae1d79046bd7871aa97ee6da51eb770134b5a
|
||||||
F ext/icu/sqliteicu.h 728867a802baa5a96de7495e9689a8e01715ef37
|
F ext/icu/sqliteicu.h 728867a802baa5a96de7495e9689a8e01715ef37
|
||||||
@ -98,7 +103,7 @@ F ext/rtree/tkt3363.test 142ab96eded44a3615ec79fba98c7bde7d0f96de
|
|||||||
F ext/rtree/viewrtree.tcl eea6224b3553599ae665b239bd827e182b466024
|
F ext/rtree/viewrtree.tcl eea6224b3553599ae665b239bd827e182b466024
|
||||||
F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895 x
|
F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895 x
|
||||||
F ltmain.sh 3ff0879076df340d2e23ae905484d8c15d5fdea8
|
F ltmain.sh 3ff0879076df340d2e23ae905484d8c15d5fdea8
|
||||||
F main.mk a80771d44176a0c744d9d4e048497e7ed0b4040d
|
F main.mk 84ed9c324cf0b8f4eb6f276553d1fd092b5ae0f4
|
||||||
F mkdll.sh 7d09b23c05d56532e9d44a50868eb4b12ff4f74a
|
F mkdll.sh 7d09b23c05d56532e9d44a50868eb4b12ff4f74a
|
||||||
F mkextu.sh 416f9b7089d80e5590a29692c9d9280a10dbad9f
|
F mkextu.sh 416f9b7089d80e5590a29692c9d9280a10dbad9f
|
||||||
F mkextw.sh 4123480947681d9b434a5e7b1ee08135abe409ac
|
F mkextw.sh 4123480947681d9b434a5e7b1ee08135abe409ac
|
||||||
@ -496,6 +501,7 @@ F test/fts4langid.test 24a6e41063b416bbdf371ff6b4476fa41c194aa7
|
|||||||
F test/fts4merge.test c424309743fdd203f8e56a1f1cd7872cd66cc0ee
|
F test/fts4merge.test c424309743fdd203f8e56a1f1cd7872cd66cc0ee
|
||||||
F test/fts4merge2.test 5faa558d1b672f82b847d2a337465fa745e46891
|
F test/fts4merge2.test 5faa558d1b672f82b847d2a337465fa745e46891
|
||||||
F test/fts4merge3.test aab02a09f50fe6baaddc2e159c3eabc116d45fc7
|
F test/fts4merge3.test aab02a09f50fe6baaddc2e159c3eabc116d45fc7
|
||||||
|
F test/fts4unicode.test 0627683f8ca06035d677d4becc2cd0dc57149ef7
|
||||||
F test/func.test 9809b7622d721904a8cc33c1ffb87f46d506ed01
|
F test/func.test 9809b7622d721904a8cc33c1ffb87f46d506ed01
|
||||||
F test/func2.test 772d66227e4e6684b86053302e2d74a2500e1e0f
|
F test/func2.test 772d66227e4e6684b86053302e2d74a2500e1e0f
|
||||||
F test/func3.test 001021e5b88bd02a3b365a5c5fd8f6f49d39744a
|
F test/func3.test 001021e5b88bd02a3b365a5c5fd8f6f49d39744a
|
||||||
@ -636,7 +642,7 @@ F test/pageropt.test 9191867ed19a2b3db6c42d1b36b6fbc657cd1ab0
|
|||||||
F test/pagesize.test 1dd51367e752e742f58e861e65ed7390603827a0
|
F test/pagesize.test 1dd51367e752e742f58e861e65ed7390603827a0
|
||||||
F test/pcache.test 065aa286e722ab24f2e51792c1f093bf60656b16
|
F test/pcache.test 065aa286e722ab24f2e51792c1f093bf60656b16
|
||||||
F test/pcache2.test a83efe2dec0d392f814bfc998def1d1833942025
|
F test/pcache2.test a83efe2dec0d392f814bfc998def1d1833942025
|
||||||
F test/permutations.test dbda172249564f43ec556108a704581044c57dbd
|
F test/permutations.test ea7b6948eaa22993fcfa662eb704ce29ddb24b2a
|
||||||
F test/pragma.test c51c148defe32bf4a419a522f95d26838d5cf677
|
F test/pragma.test c51c148defe32bf4a419a522f95d26838d5cf677
|
||||||
F test/pragma2.test 3a55f82b954242c642f8342b17dffc8b47472947
|
F test/pragma2.test 3a55f82b954242c642f8342b17dffc8b47472947
|
||||||
F test/printf.test ec9870c4dce8686a37818e0bf1aba6e6a1863552
|
F test/printf.test ec9870c4dce8686a37818e0bf1aba6e6a1863552
|
||||||
@ -972,7 +978,7 @@ F tool/mkkeywordhash.c bb52064aa614e1426445e4b2b9b00eeecd23cc79
|
|||||||
F tool/mkopts.tcl 66ac10d240cc6e86abd37dc908d50382f84ff46e
|
F tool/mkopts.tcl 66ac10d240cc6e86abd37dc908d50382f84ff46e
|
||||||
F tool/mkspeedsql.tcl a1a334d288f7adfe6e996f2e712becf076745c97
|
F tool/mkspeedsql.tcl a1a334d288f7adfe6e996f2e712becf076745c97
|
||||||
F tool/mksqlite3c-noext.tcl 105023aa86f696a74b1d6a4929d1e1c3baf9471c
|
F tool/mksqlite3c-noext.tcl 105023aa86f696a74b1d6a4929d1e1c3baf9471c
|
||||||
F tool/mksqlite3c.tcl 9fbac513cd9d5ac95ad55630f49bb16c5347ab75
|
F tool/mksqlite3c.tcl f289ba51f74f45c71a80c13e6c74a6dd92763253
|
||||||
F tool/mksqlite3h.tcl 78013ad79a5e492e5f764f3c7a8ef834255061f8
|
F tool/mksqlite3h.tcl 78013ad79a5e492e5f764f3c7a8ef834255061f8
|
||||||
F tool/mksqlite3internalh.tcl 7b43894e21bcb1bb39e11547ce7e38a063357e87
|
F tool/mksqlite3internalh.tcl 7b43894e21bcb1bb39e11547ce7e38a063357e87
|
||||||
F tool/offsets.c fe4262fdfa378e8f5499a42136d17bf3b98f6091
|
F tool/offsets.c fe4262fdfa378e8f5499a42136d17bf3b98f6091
|
||||||
@ -998,10 +1004,10 @@ F tool/tostr.awk e75472c2f98dd76e06b8c9c1367f4ab07e122d06
|
|||||||
F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
|
F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
|
||||||
F tool/warnings-clang.sh a8a0a3babda96dfb1ff51adda3cbbf3dfb7266c2
|
F tool/warnings-clang.sh a8a0a3babda96dfb1ff51adda3cbbf3dfb7266c2
|
||||||
F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381
|
F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381
|
||||||
P 5519cc5ef471e32a59995a34be811b46478dca1e
|
P 6d326d44fd1d626aae0e8456e5fa2049f1ce0789
|
||||||
R f9f0e59804fa7e60869ace7ace22bd3d
|
R cf548df9de4c764e0b1e09fe3f006b01
|
||||||
T +bgcolor * #d0c0ff
|
T *branch * fts4-unicode
|
||||||
T +sym-release *
|
T *sym-fts4-unicode *
|
||||||
T +sym-version-3.7.12.1 *
|
T -sym-trunk *
|
||||||
U drh
|
U dan
|
||||||
Z a30e57621f706278c1c64a3624297cbb
|
Z 9e1fccc2ef84e29acb91c1b6fba69301
|
||||||
|
@ -1 +1 @@
|
|||||||
6d326d44fd1d626aae0e8456e5fa2049f1ce0789
|
0c13570ec78c6887103dc99b81b470829fa28385
|
44
test/fts4unicode.test
Normal file
44
test/fts4unicode.test
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
# 2012 May 25
|
||||||
|
#
|
||||||
|
# The author disclaims copyright to this source code. In place of
|
||||||
|
# a legal notice, here is a blessing:
|
||||||
|
#
|
||||||
|
# May you do good and not evil.
|
||||||
|
# May you find forgiveness for yourself and forgive others.
|
||||||
|
# May you share freely, never taking more than you give.
|
||||||
|
#
|
||||||
|
#*************************************************************************
|
||||||
|
#
|
||||||
|
# The tests in this file focus on testing the "unicode" FTS tokenizer.
|
||||||
|
#
|
||||||
|
|
||||||
|
set testdir [file dirname $argv0]
|
||||||
|
source $testdir/tester.tcl
|
||||||
|
ifcapable !fts3 { finish_test ; return }
|
||||||
|
set ::testprefix fts4unicode
|
||||||
|
|
||||||
|
# Run one tokenizer test case through do_execsql_test in the caller's
# scope.
#
#   tn     Test case name.
#   input  Text passed to fts3_tokenizer_test() for the 'unicode'
#          tokenizer. Single quotes are doubled so the text can be embedded
#          in an SQL string literal.
#   res    Expected result, given as a flat list. It is normalised and
#          wrapped so that it is compared as a single result value.
proc do_unicode_token_test {tn input res} {
  set zQuoted [string map {' ''} $input]
  set sql "
    SELECT fts3_tokenizer_test('unicode', '$zQuoted');
  "
  set expect [list [list {*}$res]]
  uplevel [list do_execsql_test $tn $sql $expect]
}
|
||||||
|
|
||||||
|
do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
|
||||||
|
do_unicode_token_test 1.1 {<7B> <20> <20>} {0 <20> <20> 1 <20> <20> 2 <20> <20>}
|
||||||
|
do_unicode_token_test 1.2 {x<>x x<>x x<>x} {0 x<>x x<>x 1 x<>x x<>x 2 x<>x x<>x}
|
||||||
|
|
||||||
|
# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
|
||||||
|
do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
|
||||||
|
do_unicode_token_test 1.4 "\u1E9E" "0 <20> \u1E9E"
|
||||||
|
do_unicode_token_test 1.5 "\u1E9E" "0 \uDF \u1E9E"
|
||||||
|
|
||||||
|
do_unicode_token_test 1.6 "The quick brown fox" {
|
||||||
|
0 the The 1 quick quick 2 brown brown 3 fox fox
|
||||||
|
}
|
||||||
|
do_unicode_token_test 1.7 "The\u00bfquick\u224ebrown\u2263fox" {
|
||||||
|
0 the The 1 quick quick 2 brown brown 3 fox fox
|
||||||
|
}
|
||||||
|
|
||||||
|
finish_test
|
||||||
|
|
@ -185,7 +185,7 @@ test_suite "fts3" -prefix "" -description {
|
|||||||
fts4aa.test fts4content.test
|
fts4aa.test fts4content.test
|
||||||
fts3conf.test fts3prefix.test fts3fault2.test fts3corrupt.test
|
fts3conf.test fts3prefix.test fts3fault2.test fts3corrupt.test
|
||||||
fts3corrupt2.test fts3first.test fts4langid.test fts4merge.test
|
fts3corrupt2.test fts3first.test fts4langid.test fts4merge.test
|
||||||
fts4check.test
|
fts4check.test fts4unicode.test
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -316,6 +316,8 @@ foreach file {
|
|||||||
fts3_tokenizer1.c
|
fts3_tokenizer1.c
|
||||||
fts3_write.c
|
fts3_write.c
|
||||||
fts3_snippet.c
|
fts3_snippet.c
|
||||||
|
fts3_unicode.c
|
||||||
|
fts3_unicode2.c
|
||||||
|
|
||||||
rtree.c
|
rtree.c
|
||||||
icu.c
|
icu.c
|
||||||
|
Reference in New Issue
Block a user