mirror of
https://github.com/sqlite/sqlite.git
synced 2025-07-29 08:01:23 +03:00
Merge the unicode61 tokenizer and the shared-cache-memory database changes
into the sessions branch. FossilOrigin-Name: df817e70afc3f41e680d8f84dfa5772d5b3ae4d9
This commit is contained in:
@ -11,7 +11,7 @@
|
||||
);
|
||||
|
||||
The built-in tokenizers (valid values to pass as <tokenizer name>) are
|
||||
"simple" and "porter".
|
||||
"simple", "porter" and "unicode61".
|
||||
|
||||
<tokenizer-args> should consist of zero or more white-space separated
|
||||
arguments to pass to the selected tokenizer implementation. The
|
||||
|
@ -3554,6 +3554,9 @@ static void hashDestroy(void *p){
|
||||
*/
|
||||
void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
|
||||
void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
|
||||
#ifndef SQLITE_DISABLE_FTS3_UNICODE
|
||||
void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const**ppModule);
|
||||
#endif
|
||||
#ifdef SQLITE_ENABLE_ICU
|
||||
void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
|
||||
#endif
|
||||
@ -3569,12 +3572,19 @@ int sqlite3Fts3Init(sqlite3 *db){
|
||||
Fts3Hash *pHash = 0;
|
||||
const sqlite3_tokenizer_module *pSimple = 0;
|
||||
const sqlite3_tokenizer_module *pPorter = 0;
|
||||
#ifndef SQLITE_DISABLE_FTS3_UNICODE
|
||||
const sqlite3_tokenizer_module *pUnicode = 0;
|
||||
#endif
|
||||
|
||||
#ifdef SQLITE_ENABLE_ICU
|
||||
const sqlite3_tokenizer_module *pIcu = 0;
|
||||
sqlite3Fts3IcuTokenizerModule(&pIcu);
|
||||
#endif
|
||||
|
||||
#ifndef SQLITE_DISABLE_FTS3_UNICODE
|
||||
sqlite3Fts3UnicodeTokenizer(&pUnicode);
|
||||
#endif
|
||||
|
||||
#ifdef SQLITE_TEST
|
||||
rc = sqlite3Fts3InitTerm(db);
|
||||
if( rc!=SQLITE_OK ) return rc;
|
||||
@ -3598,6 +3608,10 @@ int sqlite3Fts3Init(sqlite3 *db){
|
||||
if( rc==SQLITE_OK ){
|
||||
if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple)
|
||||
|| sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter)
|
||||
|
||||
#ifndef SQLITE_DISABLE_FTS3_UNICODE
|
||||
|| sqlite3Fts3HashInsert(pHash, "unicode61", 10, (void *)pUnicode)
|
||||
#endif
|
||||
#ifdef SQLITE_ENABLE_ICU
|
||||
|| (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu))
|
||||
#endif
|
||||
|
@ -541,5 +541,9 @@ int sqlite3Fts3MsrIncrRestart(Fts3MultiSegReader *pCsr);
|
||||
|
||||
int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *);
|
||||
|
||||
/* fts3_unicode2.c (functions generated by parsing unicode text files) */
|
||||
int sqlite3FtsUnicodeTolower(int);
|
||||
int sqlite3FtsUnicodeIsalnum(int);
|
||||
|
||||
#endif /* !SQLITE_CORE || SQLITE_ENABLE_FTS3 */
|
||||
#endif /* _FTSINT_H */
|
||||
|
246
ext/fts3/fts3_unicode.c
Normal file
246
ext/fts3/fts3_unicode.c
Normal file
@ -0,0 +1,246 @@
|
||||
/*
|
||||
** 2012 May 24
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
******************************************************************************
|
||||
**
|
||||
** Implementation of the "unicode" full-text-search tokenizer.
|
||||
*/
|
||||
|
||||
#ifndef SQLITE_DISABLE_FTS3_UNICODE
|
||||
|
||||
#include "fts3Int.h"
|
||||
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "fts3_tokenizer.h"
|
||||
|
||||
/*
|
||||
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
|
||||
** from the sqlite3 source file utf.c. If this file is compiled as part
|
||||
** of the amalgamation, they are not required.
|
||||
*/
|
||||
#ifndef SQLITE_AMALGAMATION
|
||||
|
||||
static const unsigned char sqlite3Utf8Trans1[] = {
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
|
||||
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
|
||||
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
|
||||
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
|
||||
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
|
||||
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
|
||||
};
|
||||
|
||||
#define READ_UTF8(zIn, zTerm, c) \
|
||||
c = *(zIn++); \
|
||||
if( c>=0xc0 ){ \
|
||||
c = sqlite3Utf8Trans1[c-0xc0]; \
|
||||
while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){ \
|
||||
c = (c<<6) + (0x3f & *(zIn++)); \
|
||||
} \
|
||||
if( c<0x80 \
|
||||
|| (c&0xFFFFF800)==0xD800 \
|
||||
|| (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \
|
||||
}
|
||||
|
||||
#define WRITE_UTF8(zOut, c) { \
|
||||
if( c<0x00080 ){ \
|
||||
*zOut++ = (u8)(c&0xFF); \
|
||||
} \
|
||||
else if( c<0x00800 ){ \
|
||||
*zOut++ = 0xC0 + (u8)((c>>6)&0x1F); \
|
||||
*zOut++ = 0x80 + (u8)(c & 0x3F); \
|
||||
} \
|
||||
else if( c<0x10000 ){ \
|
||||
*zOut++ = 0xE0 + (u8)((c>>12)&0x0F); \
|
||||
*zOut++ = 0x80 + (u8)((c>>6) & 0x3F); \
|
||||
*zOut++ = 0x80 + (u8)(c & 0x3F); \
|
||||
}else{ \
|
||||
*zOut++ = 0xF0 + (u8)((c>>18) & 0x07); \
|
||||
*zOut++ = 0x80 + (u8)((c>>12) & 0x3F); \
|
||||
*zOut++ = 0x80 + (u8)((c>>6) & 0x3F); \
|
||||
*zOut++ = 0x80 + (u8)(c & 0x3F); \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif /* ifndef SQLITE_AMALGAMATION */
|
||||
|
||||
typedef struct unicode_tokenizer unicode_tokenizer;
|
||||
typedef struct unicode_cursor unicode_cursor;
|
||||
|
||||
struct unicode_tokenizer {
|
||||
sqlite3_tokenizer base;
|
||||
};
|
||||
|
||||
struct unicode_cursor {
|
||||
sqlite3_tokenizer_cursor base;
|
||||
const unsigned char *aInput; /* Input text being tokenized */
|
||||
int nInput; /* Size of aInput[] in bytes */
|
||||
int iOff; /* Current offset within aInput[] */
|
||||
int iToken; /* Index of next token to be returned */
|
||||
char *zToken; /* storage for current token */
|
||||
int nAlloc; /* space allocated at zToken */
|
||||
};
|
||||
|
||||
/*
|
||||
** Create a new tokenizer instance.
|
||||
*/
|
||||
static int unicodeCreate(
|
||||
int nArg, /* Size of array argv[] */
|
||||
const char * const *azArg, /* Tokenizer creation arguments */
|
||||
sqlite3_tokenizer **pp /* OUT: New tokenizer handle */
|
||||
){
|
||||
unicode_tokenizer *pNew; /* New tokenizer object */
|
||||
pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
|
||||
if( pNew==NULL ){
|
||||
return SQLITE_NOMEM;
|
||||
}
|
||||
memset(pNew, 0, sizeof(unicode_tokenizer));
|
||||
*pp = &pNew->base;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Destroy a tokenizer allocated by unicodeCreate().
|
||||
*/
|
||||
static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
|
||||
sqlite3_free(pTokenizer);
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Prepare to begin tokenizing a particular string. The input
|
||||
** string to be tokenized is pInput[0..nBytes-1]. A cursor
|
||||
** used to incrementally tokenize this string is returned in
|
||||
** *ppCursor.
|
||||
*/
|
||||
static int unicodeOpen(
|
||||
sqlite3_tokenizer *p, /* The tokenizer */
|
||||
const char *aInput, /* Input string */
|
||||
int nInput, /* Size of string aInput in bytes */
|
||||
sqlite3_tokenizer_cursor **pp /* OUT: New cursor object */
|
||||
){
|
||||
unicode_cursor *pCsr;
|
||||
|
||||
pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor));
|
||||
if( pCsr==0 ){
|
||||
return SQLITE_NOMEM;
|
||||
}
|
||||
memset(pCsr, 0, sizeof(unicode_cursor));
|
||||
|
||||
pCsr->aInput = (const unsigned char *)aInput;
|
||||
if( aInput==0 ){
|
||||
pCsr->nInput = 0;
|
||||
}else if( nInput<0 ){
|
||||
pCsr->nInput = (int)strlen(aInput);
|
||||
}else{
|
||||
pCsr->nInput = nInput;
|
||||
}
|
||||
|
||||
*pp = &pCsr->base;
|
||||
UNUSED_PARAMETER(p);
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Close a tokenization cursor previously opened by a call to
|
||||
** simpleOpen() above.
|
||||
*/
|
||||
static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
|
||||
unicode_cursor *pCsr = (unicode_cursor *) pCursor;
|
||||
sqlite3_free(pCsr->zToken);
|
||||
sqlite3_free(pCsr);
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Extract the next token from a tokenization cursor. The cursor must
|
||||
** have been opened by a prior call to simpleOpen().
|
||||
*/
|
||||
static int unicodeNext(
|
||||
sqlite3_tokenizer_cursor *p, /* Cursor returned by simpleOpen */
|
||||
const char **paToken, /* OUT: Token text */
|
||||
int *pnToken, /* OUT: Number of bytes at *paToken */
|
||||
int *piStart, /* OUT: Starting offset of token */
|
||||
int *piEnd, /* OUT: Ending offset of token */
|
||||
int *piPos /* OUT: Position integer of token */
|
||||
){
|
||||
unicode_cursor *pCsr = (unicode_cursor *)p;
|
||||
int iCode;
|
||||
char *zOut;
|
||||
const unsigned char *z = &pCsr->aInput[pCsr->iOff];
|
||||
const unsigned char *zStart = z;
|
||||
const unsigned char *zEnd;
|
||||
const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];
|
||||
|
||||
/* Scan past any delimiter characters before the start of the next token.
|
||||
** Return SQLITE_DONE early if this takes us all the way to the end of
|
||||
** the input. */
|
||||
while( z<zTerm ){
|
||||
READ_UTF8(z, zTerm, iCode);
|
||||
if( sqlite3FtsUnicodeIsalnum(iCode) ) break;
|
||||
zStart = z;
|
||||
}
|
||||
if( zStart>=zTerm ) return SQLITE_DONE;
|
||||
|
||||
zOut = pCsr->zToken;
|
||||
do {
|
||||
/* Grow the output buffer if required. */
|
||||
if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
|
||||
char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
|
||||
if( !zNew ) return SQLITE_NOMEM;
|
||||
zOut = &zNew[zOut - pCsr->zToken];
|
||||
pCsr->zToken = zNew;
|
||||
pCsr->nAlloc += 64;
|
||||
}
|
||||
|
||||
/* Write the folded case of the last character read to the output */
|
||||
zEnd = z;
|
||||
WRITE_UTF8(zOut, sqlite3FtsUnicodeTolower(iCode));
|
||||
|
||||
/* If the cursor is not at EOF, read the next character */
|
||||
if( z>=zTerm ) break;
|
||||
READ_UTF8(z, zTerm, iCode);
|
||||
}while( sqlite3FtsUnicodeIsalnum(iCode) );
|
||||
|
||||
/* Set the output variables and return. */
|
||||
pCsr->iOff = (z - pCsr->aInput);
|
||||
*paToken = pCsr->zToken;
|
||||
*pnToken = zOut - pCsr->zToken;
|
||||
*piStart = (zStart - pCsr->aInput);
|
||||
*piEnd = (zEnd - pCsr->aInput);
|
||||
*piPos = pCsr->iToken++;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Set *ppModule to a pointer to the sqlite3_tokenizer_module
|
||||
** structure for the unicode tokenizer.
|
||||
*/
|
||||
void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const **ppModule){
|
||||
static const sqlite3_tokenizer_module module = {
|
||||
0,
|
||||
unicodeCreate,
|
||||
unicodeDestroy,
|
||||
unicodeOpen,
|
||||
unicodeClose,
|
||||
unicodeNext,
|
||||
0,
|
||||
};
|
||||
*ppModule = &module;
|
||||
}
|
||||
|
||||
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
|
||||
#endif /* ifndef SQLITE_DISABLE_FTS3_UNICODE */
|
296
ext/fts3/fts3_unicode2.c
Normal file
296
ext/fts3/fts3_unicode2.c
Normal file
@ -0,0 +1,296 @@
|
||||
/*
|
||||
** 2012 May 25
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
/*
|
||||
** DO NOT EDIT THIS MACHINE GENERATED FILE.
|
||||
*/
|
||||
|
||||
#if !defined(SQLITE_DISABLE_FTS3_UNICODE)
|
||||
#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
/*
|
||||
** Return true if the argument corresponds to a unicode codepoint
|
||||
** classified as either a letter or a number. Otherwise false.
|
||||
**
|
||||
** The results are undefined if the value passed to this function
|
||||
** is less than zero.
|
||||
*/
|
||||
int sqlite3FtsUnicodeIsalnum(int c){
|
||||
/* Each unsigned integer in the following array corresponds to a contiguous
|
||||
** range of unicode codepoints that are not either letters or numbers (i.e.
|
||||
** codepoints for which this function should return 0).
|
||||
**
|
||||
** The most significant 22 bits in each 32-bit value contain the first
|
||||
** codepoint in the range. The least significant 10 bits are used to store
|
||||
** the size of the range (always at least 1). In other words, the value
|
||||
** ((C<<22) + N) represents a range of N codepoints starting with codepoint
|
||||
** C. It is not possible to represent a range larger than 1023 codepoints
|
||||
** using this format.
|
||||
*/
|
||||
const static unsigned int aEntry[] = {
|
||||
0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
|
||||
0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
|
||||
0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
|
||||
0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
|
||||
0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
|
||||
0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
|
||||
0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
|
||||
0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
|
||||
0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
|
||||
0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
|
||||
0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
|
||||
0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
|
||||
0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
|
||||
0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
|
||||
0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
|
||||
0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
|
||||
0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
|
||||
0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
|
||||
0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
|
||||
0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
|
||||
0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
|
||||
0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
|
||||
0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
|
||||
0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
|
||||
0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
|
||||
0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
|
||||
0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
|
||||
0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
|
||||
0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
|
||||
0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
|
||||
0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
|
||||
0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
|
||||
0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
|
||||
0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
|
||||
0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
|
||||
0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
|
||||
0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
|
||||
0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
|
||||
0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
|
||||
0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
|
||||
0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
|
||||
0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
|
||||
0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
|
||||
0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
|
||||
0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
|
||||
0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
|
||||
0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
|
||||
0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
|
||||
0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
|
||||
0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
|
||||
0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
|
||||
0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
|
||||
0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
|
||||
0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
|
||||
0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
|
||||
0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
|
||||
0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
|
||||
0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
|
||||
0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
|
||||
0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
|
||||
0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
|
||||
0x037FFC02, 0x03E3FC01, 0x03EC7801, 0x03ECA401, 0x03EEC810,
|
||||
0x03F4F802, 0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023,
|
||||
0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807,
|
||||
0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405,
|
||||
0x04040003, 0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E,
|
||||
0x040E7C01, 0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01,
|
||||
0x04280403, 0x04281402, 0x04283004, 0x0428E003, 0x0428FC01,
|
||||
0x04294009, 0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016,
|
||||
0x04420003, 0x0442C012, 0x04440003, 0x04449C0E, 0x04450004,
|
||||
0x04460003, 0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004,
|
||||
0x05BD442E, 0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5,
|
||||
0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01,
|
||||
0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401,
|
||||
0x075EA401, 0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064,
|
||||
0x07C2800F, 0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F,
|
||||
0x07C4C03C, 0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009,
|
||||
0x07C94002, 0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014,
|
||||
0x07CE8025, 0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001,
|
||||
0x07D108B6, 0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018,
|
||||
0x07D7EC46, 0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401,
|
||||
0x38008060, 0x380400F0, 0x3C000001, 0x3FFFF401, 0x40000001,
|
||||
0x43FFF401,
|
||||
};
|
||||
static const unsigned int aAscii[4] = {
|
||||
0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
|
||||
};
|
||||
|
||||
if( c<128 ){
|
||||
return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
|
||||
}else if( c<(1<<22) ){
|
||||
unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
|
||||
int iRes;
|
||||
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
|
||||
int iLo = 0;
|
||||
while( iHi>=iLo ){
|
||||
int iTest = (iHi + iLo) / 2;
|
||||
if( key >= aEntry[iTest] ){
|
||||
iRes = iTest;
|
||||
iLo = iTest+1;
|
||||
}else{
|
||||
iHi = iTest-1;
|
||||
}
|
||||
}
|
||||
assert( aEntry[0]<key );
|
||||
assert( key>=aEntry[iRes] );
|
||||
return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
** Interpret the argument as a unicode codepoint. If the codepoint
|
||||
** is an upper case character that has a lower case equivalent,
|
||||
** return the codepoint corresponding to the lower case version.
|
||||
** Otherwise, return a copy of the argument.
|
||||
**
|
||||
** The results are undefined if the value passed to this function
|
||||
** is less than zero.
|
||||
*/
|
||||
int sqlite3FtsUnicodeTolower(int c){
|
||||
/* Each entry in the following array defines a rule for folding a range
|
||||
** of codepoints to lower case. The rule applies to a range of nRange
|
||||
** codepoints starting at codepoint iCode.
|
||||
**
|
||||
** If the least significant bit in flags is clear, then the rule applies
|
||||
** to all nRange codepoints (i.e. all nRange codepoints are upper case and
|
||||
** need to be folded). Or, if it is set, then the rule only applies to
|
||||
** every second codepoint in the range, starting with codepoint C.
|
||||
**
|
||||
** The 7 most significant bits in flags are an index into the aiOff[]
|
||||
** array. If a specific codepoint C does require folding, then its lower
|
||||
** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
|
||||
**
|
||||
** The contents of this array are generated by parsing the CaseFolding.txt
|
||||
** file distributed as part of the "Unicode Character Database". See
|
||||
** http://www.unicode.org for details.
|
||||
*/
|
||||
static const struct TableEntry {
|
||||
unsigned short iCode;
|
||||
unsigned char flags;
|
||||
unsigned char nRange;
|
||||
} aEntry[] = {
|
||||
{65, 14, 26}, {181, 64, 1}, {192, 14, 23},
|
||||
{216, 14, 7}, {256, 1, 48}, {306, 1, 6},
|
||||
{313, 1, 16}, {330, 1, 46}, {376, 116, 1},
|
||||
{377, 1, 6}, {383, 104, 1}, {385, 50, 1},
|
||||
{386, 1, 4}, {390, 44, 1}, {391, 0, 1},
|
||||
{393, 42, 2}, {395, 0, 1}, {398, 32, 1},
|
||||
{399, 38, 1}, {400, 40, 1}, {401, 0, 1},
|
||||
{403, 42, 1}, {404, 46, 1}, {406, 52, 1},
|
||||
{407, 48, 1}, {408, 0, 1}, {412, 52, 1},
|
||||
{413, 54, 1}, {415, 56, 1}, {416, 1, 6},
|
||||
{422, 60, 1}, {423, 0, 1}, {425, 60, 1},
|
||||
{428, 0, 1}, {430, 60, 1}, {431, 0, 1},
|
||||
{433, 58, 2}, {435, 1, 4}, {439, 62, 1},
|
||||
{440, 0, 1}, {444, 0, 1}, {452, 2, 1},
|
||||
{453, 0, 1}, {455, 2, 1}, {456, 0, 1},
|
||||
{458, 2, 1}, {459, 1, 18}, {478, 1, 18},
|
||||
{497, 2, 1}, {498, 1, 4}, {502, 122, 1},
|
||||
{503, 134, 1}, {504, 1, 40}, {544, 110, 1},
|
||||
{546, 1, 18}, {570, 70, 1}, {571, 0, 1},
|
||||
{573, 108, 1}, {574, 68, 1}, {577, 0, 1},
|
||||
{579, 106, 1}, {580, 28, 1}, {581, 30, 1},
|
||||
{582, 1, 10}, {837, 36, 1}, {880, 1, 4},
|
||||
{886, 0, 1}, {902, 18, 1}, {904, 16, 3},
|
||||
{908, 26, 1}, {910, 24, 2}, {913, 14, 17},
|
||||
{931, 14, 9}, {962, 0, 1}, {975, 4, 1},
|
||||
{976, 140, 1}, {977, 142, 1}, {981, 146, 1},
|
||||
{982, 144, 1}, {984, 1, 24}, {1008, 136, 1},
|
||||
{1009, 138, 1}, {1012, 130, 1}, {1013, 128, 1},
|
||||
{1015, 0, 1}, {1017, 152, 1}, {1018, 0, 1},
|
||||
{1021, 110, 3}, {1024, 34, 16}, {1040, 14, 32},
|
||||
{1120, 1, 34}, {1162, 1, 54}, {1216, 6, 1},
|
||||
{1217, 1, 14}, {1232, 1, 88}, {1329, 22, 38},
|
||||
{4256, 66, 38}, {4295, 66, 1}, {4301, 66, 1},
|
||||
{7680, 1, 150}, {7835, 132, 1}, {7838, 96, 1},
|
||||
{7840, 1, 96}, {7944, 150, 8}, {7960, 150, 6},
|
||||
{7976, 150, 8}, {7992, 150, 8}, {8008, 150, 6},
|
||||
{8025, 151, 8}, {8040, 150, 8}, {8072, 150, 8},
|
||||
{8088, 150, 8}, {8104, 150, 8}, {8120, 150, 2},
|
||||
{8122, 126, 2}, {8124, 148, 1}, {8126, 100, 1},
|
||||
{8136, 124, 4}, {8140, 148, 1}, {8152, 150, 2},
|
||||
{8154, 120, 2}, {8168, 150, 2}, {8170, 118, 2},
|
||||
{8172, 152, 1}, {8184, 112, 2}, {8186, 114, 2},
|
||||
{8188, 148, 1}, {8486, 98, 1}, {8490, 92, 1},
|
||||
{8491, 94, 1}, {8498, 12, 1}, {8544, 8, 16},
|
||||
{8579, 0, 1}, {9398, 10, 26}, {11264, 22, 47},
|
||||
{11360, 0, 1}, {11362, 88, 1}, {11363, 102, 1},
|
||||
{11364, 90, 1}, {11367, 1, 6}, {11373, 84, 1},
|
||||
{11374, 86, 1}, {11375, 80, 1}, {11376, 82, 1},
|
||||
{11378, 0, 1}, {11381, 0, 1}, {11390, 78, 2},
|
||||
{11392, 1, 100}, {11499, 1, 4}, {11506, 0, 1},
|
||||
{42560, 1, 46}, {42624, 1, 24}, {42786, 1, 14},
|
||||
{42802, 1, 62}, {42873, 1, 4}, {42877, 76, 1},
|
||||
{42878, 1, 10}, {42891, 0, 1}, {42893, 74, 1},
|
||||
{42896, 1, 4}, {42912, 1, 10}, {42922, 72, 1},
|
||||
{65313, 14, 26},
|
||||
};
|
||||
static const unsigned short aiOff[] = {
|
||||
1, 2, 8, 15, 16, 26, 28, 32,
|
||||
37, 38, 40, 48, 63, 64, 69, 71,
|
||||
79, 80, 116, 202, 203, 205, 206, 207,
|
||||
209, 210, 211, 213, 214, 217, 218, 219,
|
||||
775, 7264, 10792, 10795, 23228, 23256, 30204, 54721,
|
||||
54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274,
|
||||
57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406,
|
||||
65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462,
|
||||
65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511,
|
||||
65514, 65521, 65527, 65528, 65529,
|
||||
};
|
||||
|
||||
int ret = c;
|
||||
|
||||
assert( c>=0 );
|
||||
assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
|
||||
|
||||
if( c<128 ){
|
||||
if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
|
||||
}else if( c<65536 ){
|
||||
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
|
||||
int iLo = 0;
|
||||
int iRes = -1;
|
||||
|
||||
while( iHi>=iLo ){
|
||||
int iTest = (iHi + iLo) / 2;
|
||||
int cmp = (c - aEntry[iTest].iCode);
|
||||
if( cmp>=0 ){
|
||||
iRes = iTest;
|
||||
iLo = iTest+1;
|
||||
}else{
|
||||
iHi = iTest-1;
|
||||
}
|
||||
}
|
||||
assert( iRes<0 || c>=aEntry[iRes].iCode );
|
||||
|
||||
if( iRes>=0 ){
|
||||
const struct TableEntry *p = &aEntry[iRes];
|
||||
if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
|
||||
ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
|
||||
assert( ret>0 );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else if( c>=66560 && c<66600 ){
|
||||
ret = c + 40;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
|
||||
#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */
|
@ -3174,7 +3174,12 @@ static void fts3UpdateDocTotals(
|
||||
}else{
|
||||
memset(a, 0, sizeof(u32)*(nStat) );
|
||||
}
|
||||
sqlite3_reset(pStmt);
|
||||
rc = sqlite3_reset(pStmt);
|
||||
if( rc!=SQLITE_OK ){
|
||||
sqlite3_free(a);
|
||||
*pRC = rc;
|
||||
return;
|
||||
}
|
||||
if( nChng<0 && a[0]<(u32)(-nChng) ){
|
||||
a[0] = 0;
|
||||
}else{
|
||||
|
1224
ext/fts3/unicode/CaseFolding.txt
Normal file
1224
ext/fts3/unicode/CaseFolding.txt
Normal file
File diff suppressed because it is too large
Load Diff
24428
ext/fts3/unicode/UnicodeData.txt
Normal file
24428
ext/fts3/unicode/UnicodeData.txt
Normal file
File diff suppressed because it is too large
Load Diff
574
ext/fts3/unicode/mkunicode.tcl
Normal file
574
ext/fts3/unicode/mkunicode.tcl
Normal file
@ -0,0 +1,574 @@
|
||||
|
||||
|
||||
# Parameter $zName must be a path to the file UnicodeData.txt. This command
|
||||
# reads the file and returns a list of codepoints (integers). The list
|
||||
# contains all codepoints in the UnicodeData.txt assigned to any "General
|
||||
# Category" that is not a "Letter" or "Number".
|
||||
#
|
||||
proc an_load_unicodedata_text {zName} {
|
||||
set fd [open $zName]
|
||||
set lField {
|
||||
code
|
||||
character_name
|
||||
general_category
|
||||
canonical_combining_classes
|
||||
bidirectional_category
|
||||
character_decomposition_mapping
|
||||
decimal_digit_value
|
||||
digit_value
|
||||
numeric_value
|
||||
mirrored
|
||||
unicode_1_name
|
||||
iso10646_comment_field
|
||||
uppercase_mapping
|
||||
lowercase_mapping
|
||||
titlecase_mapping
|
||||
}
|
||||
set lRet [list]
|
||||
|
||||
while { ![eof $fd] } {
|
||||
set line [gets $fd]
|
||||
if {$line == ""} continue
|
||||
|
||||
set fields [split $line ";"]
|
||||
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
|
||||
foreach $lField $fields {}
|
||||
|
||||
set iCode [expr "0x$code"]
|
||||
set bAlnum [expr {[lsearch {L N} [string range $general_category 0 0]]>=0}]
|
||||
|
||||
if { !$bAlnum } { lappend lRet $iCode }
|
||||
}
|
||||
|
||||
close $fd
|
||||
set lRet
|
||||
}
|
||||
|
||||
proc an_load_separator_ranges {} {
|
||||
global unicodedata.txt
|
||||
set lSep [an_load_unicodedata_text ${unicodedata.txt}]
|
||||
unset -nocomplain iFirst
|
||||
unset -nocomplain nRange
|
||||
set lRange [list]
|
||||
foreach sep $lSep {
|
||||
if {0==[info exists iFirst]} {
|
||||
set iFirst $sep
|
||||
set nRange 1
|
||||
} elseif { $sep == ($iFirst+$nRange) } {
|
||||
incr nRange
|
||||
} else {
|
||||
lappend lRange [list $iFirst $nRange]
|
||||
set iFirst $sep
|
||||
set nRange 1
|
||||
}
|
||||
}
|
||||
lappend lRange [list $iFirst $nRange]
|
||||
set lRange
|
||||
}
|
||||
|
||||
proc an_print_range_array {lRange} {
|
||||
set iFirstMax 0
|
||||
set nRangeMax 0
|
||||
foreach range $lRange {
|
||||
foreach {iFirst nRange} $range {}
|
||||
if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
|
||||
if {$nRange > $nRangeMax} {set nRangeMax $nRange}
|
||||
}
|
||||
if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
|
||||
if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}
|
||||
|
||||
puts -nonewline " "
|
||||
puts [string trim {
|
||||
/* Each unsigned integer in the following array corresponds to a contiguous
|
||||
** range of unicode codepoints that are not either letters or numbers (i.e.
|
||||
** codepoints for which this function should return 0).
|
||||
**
|
||||
** The most significant 22 bits in each 32-bit value contain the first
|
||||
** codepoint in the range. The least significant 10 bits are used to store
|
||||
** the size of the range (always at least 1). In other words, the value
|
||||
** ((C<<22) + N) represents a range of N codepoints starting with codepoint
|
||||
** C. It is not possible to represent a range larger than 1023 codepoints
|
||||
** using this format.
|
||||
*/
|
||||
}]
|
||||
puts -nonewline " const static unsigned int aEntry\[\] = \{"
|
||||
set i 0
|
||||
foreach range $lRange {
|
||||
foreach {iFirst nRange} $range {}
|
||||
set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]]
|
||||
|
||||
if {($i % 5)==0} {puts "" ; puts -nonewline " "}
|
||||
puts -nonewline " $u32,"
|
||||
incr i
|
||||
}
|
||||
puts ""
|
||||
puts " \};"
|
||||
}
|
||||
|
||||
proc an_print_ascii_bitmap {lRange} {
|
||||
foreach range $lRange {
|
||||
foreach {iFirst nRange} $range {}
|
||||
for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} {
|
||||
if {$i<=127} { set a($i) 1 }
|
||||
}
|
||||
}
|
||||
|
||||
set aAscii [list 0 0 0 0]
|
||||
foreach key [array names a] {
|
||||
set idx [expr $key >> 5]
|
||||
lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))]
|
||||
}
|
||||
|
||||
puts " static const unsigned int aAscii\[4\] = \{"
|
||||
puts -nonewline " "
|
||||
foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] }
|
||||
puts ""
|
||||
puts " \};"
|
||||
}
|
||||
|
||||
# Print the C function named zFunc to stdout. The generated function takes
# an "int c" and consults two tables that this proc also emits:
#
#   aEntry[]  - printed by an_print_range_array from lRange
#   aAscii[]  - printed by an_print_ascii_bitmap from lRange
#
# Compared with a naive emission, the generated C is hardened as follows:
#
#   * iRes is initialised, so it is never read uninitialised if the binary
#     search finds no entry (and compilers do not warn about it).
#   * The bitmap mask is built from an unsigned 1, because (1 << 31) on a
#     signed 32-bit int is undefined behaviour in C.
#   * c is compared as unsigned, so a negative argument can no longer index
#     aAscii[] out of bounds; it falls through to the final "return 1".
proc print_isalnum {zFunc lRange} {
  puts "/*"
  puts "** Return true if the argument corresponds to a unicode codepoint"
  puts "** classified as either a letter or a number. Otherwise false."
  puts "**"
  puts "** The results are undefined if the value passed to this function"
  puts "** is less than zero."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"
  an_print_range_array $lRange
  an_print_ascii_bitmap $lRange
  puts {
if( (unsigned int)c<128 ){
return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
}else if( (unsigned int)c<(1<<22) ){
unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
int iRes = 0;
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
int iLo = 0;
while( iHi>=iLo ){
int iTest = (iHi + iLo) / 2;
if( key >= aEntry[iTest] ){
iRes = iTest;
iLo = iTest+1;
}else{
iHi = iTest-1;
}
}
assert( aEntry[0]<key );
assert( key>=aEntry[iRes] );
return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
}
return 1;}
  puts "\}"
}
|
||||
|
||||
# Print the C function isalnum_test() to stdout. The generated function
# checks zFunc against three tables built from lRange (a list of
# {iFirst nRange} pairs):
#
#   aAlnum[]      - for each codepoint below 70000, 0 if the codepoint is
#                   covered by lRange and 1 otherwise.
#   aLargeSep[]   - every covered codepoint >= 70000 (zFunc must return 0).
#   aLargeOther[] - the uncovered immediate neighbours of those codepoints
#                   (zFunc must return 1).
#
# On the first mismatch isalnum_test() stores the codepoint in *piCode and
# returns 1; it returns 0 if every check passes.
proc print_test_isalnum {zFunc lRange} {
  # Expand the ranges into a set of covered codepoints (array keys).
  foreach range $lRange {
    foreach {iFirst nRange} $range {}
    set iEnd [expr {$iFirst + $nRange}]
    for {set cp $iFirst} {$cp < $iEnd} {incr cp} { set covered($cp) 1 }
  }

  puts "static int isalnum_test(int *piCode)\{"

  # Dense expectation table, 32 values per output line.
  puts -nonewline " unsigned char aAlnum\[\] = \{"
  for {set cp 0} {$cp < 70000} {incr cp} {
    if {($cp % 32)==0} { puts -nonewline "\n " }
    set bFlag [expr {[info exists covered($cp)] ? 0 : 1}]
    puts -nonewline "${bFlag},"
  }
  puts ""
  puts " \};"

  # Covered codepoints that lie beyond the dense table, in ascending order.
  set lLarge [list]
  foreach cp [lsort -integer [array names covered]] {
    if {$cp >= 70000} { lappend lLarge $cp }
  }

  puts -nonewline " int aLargeSep\[\] = \{"
  set nOut 0
  foreach cp $lLarge {
    if {($nOut % 8)==0} { puts -nonewline "\n " }
    puts -nonewline " $cp,"
    incr nOut
  }
  puts ""
  puts " \};"

  # Uncovered neighbours (cp-1 first, then cp+1) of each large codepoint.
  puts -nonewline " int aLargeOther\[\] = \{"
  set nOut 0
  foreach cp $lLarge {
    foreach iNear [list [expr {$cp - 1}] [expr {$cp + 1}]] {
      if {[info exists covered($iNear)]} continue
      if {($nOut % 8)==0} { puts -nonewline "\n " }
      puts -nonewline " $iNear,"
      incr nOut
    }
  }
  puts ""
  puts " \};"

  # Only ${zFunc} is substituted below; -nocommands keeps the C array
  # subscripts from being evaluated as Tcl commands.
  puts [subst -nocommands {
int i;
for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){
if( ${zFunc}(i)!=aAlnum[i] ){
*piCode = i;
return 1;
}
}
for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){
if( ${zFunc}(aLargeSep[i])!=0 ){
*piCode = aLargeSep[i];
return 1;
}
}
for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){
if( ${zFunc}(aLargeOther[i])!=1 ){
*piCode = aLargeOther[i];
return 1;
}
}
}]
  puts " return 0;"
  puts "\}"
}
|
||||
|
||||
#-------------------------------------------------------------------------
|
||||
|
||||
# Load the case-folding data file zName into the global array
# tl_lookup_table.
#
# Blank lines and lines starting with "#" are ignored. Every other line is
# split on ";" into <code>; <status>; <mapping>; <trailing text>. The code
# and mapping fields hold whitespace-separated hexadecimal codepoints. For
# lines whose status is "C" or "S" the array entry keyed by the decimal
# code is set to the decimal mapping; all other lines are ignored.
#
# The file is always closed before returning, including when a line fails
# to parse (in which case the original error is re-raised).
proc tl_load_casefolding_txt {zName} {
  global tl_lookup_table

  set fd [open $zName]
  set bFailed [catch {
    while {[gets $fd line] >= 0} {
      if {$line eq "" || [string index $line 0] eq "#"} continue

      foreach {zCode zStatus zMapping zRest} [split $line ";"] {}

      # Convert hexadecimal fields to decimal. [format %d] raises an error
      # on malformed input rather than evaluating it as an expression.
      set lCode [list]
      set lMapping [list]
      foreach elem $zCode    { lappend lCode    [format %d "0x$elem"] }
      foreach elem $zMapping { lappend lMapping [format %d "0x$elem"] }

      set zStatus [string trim $zStatus]
      if {$zStatus eq "C" || $zStatus eq "S"} {
        set tl_lookup_table($lCode) $lMapping
      }
    }
  } zErr]

  # Capture the error state before [close] can overwrite it.
  set zInfo $::errorInfo
  set zErrCode $::errorCode
  close $fd
  if {$bFailed} { error $zErr $zInfo $zErrCode }
}
|
||||
|
||||
# Compress the global tl_lookup_table array into a list of records and
# return it. Each record is a four-element list:
#
#   {iFirst nIncr nRange nOff}
#
# describing nRange codepoints iFirst, iFirst+nIncr, iFirst+2*nIncr, ...
# each of which maps to (codepoint + nOff). nIncr is 1 or 2. Codepoints
# are visited in ascending order; a run is extended for as long as both
# the spacing and the offset stay the same.
proc tl_create_records {} {
  global tl_lookup_table

  set lRecord [list]

  # State of the run currently being built. iFirst=="" means "no run yet".
  set iFirst ""
  set nOff 0
  set nRange 0
  set nIncr 0

  foreach code [lsort -integer [array names tl_lookup_table]] {
    set mapping $tl_lookup_table($code)
    set delta [expr {$mapping - $code}]

    if {$iFirst ne ""} {
      # Distance from the last codepoint of the current run.
      set gap [expr {$code - ($iFirst + ($nIncr * ($nRange - 1)))}]

      # A run of length one has not committed to a stride yet: the second
      # member decides whether it is 1 or 2.
      if {$nRange==1 && ($gap==1 || $gap==2)} {
        set nIncr $gap
      }

      if {$gap == $nIncr && $delta == $nOff} {
        incr nRange
        continue
      }

      # The run cannot be extended. Flush it; a single-element run is
      # always recorded with stride 1.
      if {$nRange==1} { set nIncr 1 }
      lappend lRecord [list $iFirst $nIncr $nRange $nOff]
    }

    # Begin a new run at this codepoint.
    set iFirst $code
    set nOff $delta
    set nRange 1
    set nIncr 1
  }

  # Flush the final run.
  lappend lRecord [list $iFirst $nIncr $nRange $nOff]

  return $lRecord
}
|
||||
|
||||
# Print the opening of the generated aEntry[] table: an explanatory C
# comment followed by the "struct TableEntry" declaration, up to and
# including the opening brace of the array initializer. The entries
# themselves and the closing "};" are printed by other procs.
proc tl_print_table_header {} {
  set zComment [string trim {
/* Each entry in the following array defines a rule for folding a range
** of codepoints to lower case. The rule applies to a range of nRange
** codepoints starting at codepoint iCode.
**
** If the least significant bit in flags is clear, then the rule applies
** to all nRange codepoints (i.e. all nRange codepoints are upper case and
** need to be folded). Or, if it is set, then the rule only applies to
** every second codepoint in the range, starting with codepoint C.
**
** The 7 most significant bits in flags are an index into the aiOff[]
** array. If a specific codepoint C does require folding, then its lower
** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
**
** The contents of this array are generated by parsing the CaseFolding.txt
** file distributed as part of the "Unicode Character Database". See
** http://www.unicode.org for details.
*/
  }]

  # The trimmed comment is indented by a single space on its first line.
  puts " $zComment"

  foreach zLine [list \
    " static const struct TableEntry \{" \
    " unsigned short iCode;" \
    " unsigned char flags;" \
    " unsigned char nRange;" \
    " \} aEntry\[\] = \{" \
  ] {
    puts $zLine
  }
}
|
||||
|
||||
# Print one initializer "{iCode, flags, nRange}," of the generated
# aEntry[] table, three initializers per output line.
#
#   togglevar - name of a variable in the caller's scope holding the
#               column (0..2) of the next initializer; created on first use.
#   entry     - record {iFirst nIncr nRange nOff}.
#   liOff     - list of distinct offsets; the position of nOff in it is
#               stored in the upper bits of the flags field.
#
# Returns 1, printing nothing, if the record cannot be stored in the table
# because iFirst does not fit in the 16-bit iCode field. Returns 0 after
# printing otherwise.
#
# The table stores iCode as "unsigned short" and nRange as "unsigned char".
# A record starting at 65536 is therefore rejected too (a ">" test would
# let it wrap to 0), and a range longer than 255 raises an error rather
# than being silently truncated by the C compiler.
proc tl_print_table_entry {togglevar entry liOff} {
  upvar $togglevar t
  foreach {iFirst nIncr nRange nOff} $entry {}

  if {$iFirst >= (1<<16)} { return 1 }

  # Bit 0 of flags marks an "every second codepoint" rule, in which case
  # the stored range counts the skipped codepoints as well.
  set flags 0
  if {$nIncr==2} { set flags 1 ; set nRange [expr {$nRange * 2}] }
  if {$nRange > 255} {
    error "range of $nRange codepoints at $iFirst does not fit in nRange"
  }

  # Negative offsets are stored modulo 2^16.
  if {$nOff<0} { incr nOff [expr {1<<16}] }

  set idx [lsearch -exact $liOff $nOff]
  if {$idx<0} {error "malfunction generating aiOff"}
  set flags [expr {$flags + $idx*2}]

  if {[info exists t]==0} {set t 0}
  if {$t==0} { puts -nonewline " " }

  set txt "{$iFirst, $flags, $nRange},"
  if {$t==2} {
    puts $txt
  } else {
    puts -nonewline [format "% -23s" $txt]
  }
  set t [expr {($t+1)%3}]

  return 0
}
|
||||
|
||||
# Print the closing "};" of the generated aEntry[] table. togglevar names
# the caller's column counter maintained by tl_print_table_entry; if it is
# non-zero the last output line is still open and is terminated first.
proc tl_print_table_footer {togglevar} {
  upvar $togglevar iColumn
  if {$iColumn != 0} {
    puts ""
  }
  puts " \};"
}
|
||||
|
||||
# Print a C "else if" branch that applies the folding rule described by
# entry ({iFirst nIncr nRange nOff}) to the variable "c", storing the
# result in "ret". Only rules with a stride of 1 are supported; a stride of
# 2 raises an error.
proc tl_print_if_entry {entry} {
  foreach {iFirst nIncr nRange nOff} $entry {}
  if {$nIncr==2} {error "tl_print_if_entry needs improvement!"}

  # First codepoint past the end of the range.
  set iEnd [expr {$iFirst + $nRange}]

  puts " else if( c>=$iFirst && c<$iEnd )\{"
  puts " ret = c + $nOff;"
  puts " \}"
}
|
||||
|
||||
# Return the sorted list of distinct offsets used by the records in
# lRecord (each record is {iFirst nIncr nRange iOff}). Negative offsets
# are first converted to their value modulo 2^16. Raises an error if there
# are more than 128 distinct offsets.
proc tl_generate_ioff_table {lRecord} {
  set lAll [list]
  foreach entry $lRecord {
    foreach {iFirst nIncr nRange iOff} $entry {}
    if {$iOff<0} { incr iOff [expr {1<<16}] }
    lappend lAll $iOff
  }

  # Sort numerically and drop duplicates in one pass.
  set liOff [lsort -integer -unique $lAll]

  if {[llength $liOff]>128} { error "Too many distinct ioffs" }
  return $liOff
}
|
||||
|
||||
# Print the C declaration of aiOff[], initialised with the values in
# liOff. Eight values are printed per line, each left-justified in a
# seven-character column.
proc tl_print_ioff_table {liOff} {
  set zOut " static const unsigned short aiOff\[\] = \{"

  set nOut 0
  foreach off $liOff {
    # Start a new, indented line before every eighth value.
    if {($nOut % 8)==0} { append zOut "\n " }
    append zOut [format "% -7s" "$off,"]
    incr nOut
  }

  append zOut "\n \};"
  puts $zOut
}
|
||||
|
||||
# Print the C function named zFunc to stdout. The generated function maps
# a codepoint to its folded equivalent using:
#
#   * a direct test for 'A'..'Z' when c<128;
#   * a binary search of the generated aEntry[] table, combined with the
#     aiOff[] offset table, when c<65536;
#   * one "else if" branch for each record that tl_print_table_entry
#     declined to store in the table.
#
# The records are obtained from tl_create_records, so the global
# tl_lookup_table must have been loaded before this proc is called.
proc print_tolower {zFunc} {
  set lRecord [tl_create_records]

  foreach zLine [list \
    "/*" \
    "** Interpret the argument as a unicode codepoint. If the codepoint" \
    "** is an upper case character that has a lower case equivalent," \
    "** return the codepoint corresponding to the lower case version." \
    "** Otherwise, return a copy of the argument." \
    "**" \
    "** The results are undefined if the value passed to this function" \
    "** is less than zero." \
    "*/" \
    "int ${zFunc}\(int c)\{" \
  ] {
    puts $zLine
  }

  set liOff [tl_generate_ioff_table $lRecord]

  # Emit the table. Records rejected by tl_print_table_entry (non-zero
  # return) are kept aside and handled by explicit branches below.
  set lHigh [list]
  tl_print_table_header
  foreach entry $lRecord {
    set bRejected [tl_print_table_entry toggle $entry $liOff]
    if {$bRejected} { lappend lHigh $entry }
  }
  tl_print_table_footer toggle
  tl_print_ioff_table $liOff

  puts {
int ret = c;

assert( c>=0 );
assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

if( c<128 ){
if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
}else if( c<65536 ){
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
int iLo = 0;
int iRes = -1;

while( iHi>=iLo ){
int iTest = (iHi + iLo) / 2;
int cmp = (c - aEntry[iTest].iCode);
if( cmp>=0 ){
iRes = iTest;
iLo = iTest+1;
}else{
iHi = iTest-1;
}
}
assert( iRes<0 || c>=aEntry[iRes].iCode );

if( iRes>=0 ){
const struct TableEntry *p = &aEntry[iRes];
if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
assert( ret>0 );
}
}
}
}

  # These branches continue the if/else-if chain opened in the text above.
  foreach entry $lHigh { tl_print_if_entry $entry }

  puts ""
  puts " return ret;"
  puts "\}"
}
|
||||
|
||||
# Print the C function tolower_test() to stdout. The generated function
# holds a 70000-element table of expected results: the value stored in the
# global tl_lookup_table for codepoints that have an entry, and the
# codepoint itself otherwise. It calls zFunc on every index; on the first
# mismatch it stores the index in *piCode and returns 1, and it returns 0
# if all values match.
proc print_tolower_test {zFunc} {
  global tl_lookup_table

  puts "static int tolower_test(int *piCode)\{"
  puts -nonewline " static int aLookup\[\] = \{"

  # Eight expected values per output line.
  for {set cp 0} {$cp < 70000} {incr cp} {
    if {[info exists tl_lookup_table($cp)]} {
      set expected $tl_lookup_table($cp)
    } else {
      set expected $cp
    }
    if {($cp % 8)==0} { puts -nonewline "\n " }
    puts -nonewline "$expected, "
  }

  # The closing brace follows the last value on the same output line.
  puts " \};"
  puts " int i;"
  puts " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
  puts " if( ${zFunc}\(i)!=aLookup\[i\] )\{"
  puts " *piCode = i;"
  puts " return 1;"
  puts " \}"
  puts " \}"
  puts " return 0;"
  puts "\}"
}
|
||||
|
||||
|
||||
# Print the preamble of the generated C file: the blessing/"do not edit"
# comment, the two opening preprocessor guards (closed again at the very
# end of the script) and the #include of <assert.h>.
proc print_fileheader {} {
  puts [string trim {
/*
** 2012 May 25
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
*/

/*
** DO NOT EDIT THIS MACHINE GENERATED FILE.
*/
  }]

  puts [join [list \
    "" \
    "#if !defined(SQLITE_DISABLE_FTS3_UNICODE)" \
    "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)" \
    "" \
    "#include <assert.h>" \
    "" \
  ] "\n"]
}
|
||||
|
||||
# Print a C main() function to stdout. The generated program runs
# isalnum_test() and tolower_test(), reports the outcome of each with
# printf(), and exits non-zero if either test failed.
proc print_test_main {} {
  puts [join [list \
    "" \
    "#include <stdio.h>" \
    "" \
    "int main(int argc, char **argv)\{" \
    " int r1, r2;" \
    " int code;" \
    " r1 = isalnum_test(&code);" \
    " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);" \
    " else printf(\"isalnum(): test passed\\n\");" \
    " r2 = tolower_test(&code);" \
    " if( r2 ) printf(\"tolower(): Problem with code %d\\n\",code);" \
    " else printf(\"tolower(): test passed\\n\");" \
    " return (r1 || r2);" \
    "\}" \
  ] "\n"]
}
|
||||
|
||||
# Process the command line arguments. Exit early if they are not to
|
||||
# our liking.
|
||||
#
|
||||
# Write a usage message to stderr and terminate the script with exit
# status 1.
proc usage {} {
  set zMsg "Usage: $::argv0 ?-test? "
  append zMsg "<CaseFolding.txt file> <UnicodeData.txt file>"
  puts stderr $zMsg
  exit 1
}
|
||||
# Accepted invocations:
#
#   <script> <CaseFolding.txt file> <UnicodeData.txt file>
#   <script> -test <CaseFolding.txt file> <UnicodeData.txt file>
#
# Anything else prints the usage message and exits.
switch -- [llength $argv] {
  2 {
    set generate_test_code 0
  }
  3 {
    if {[lindex $argv 0]!="-test"} usage
    set generate_test_code 1
  }
  default usage
}

# The two data files are always the last two arguments.
set casefolding.txt [lindex $argv end-1]
set unicodedata.txt [lindex $argv end]

# File preamble, followed by the generated isalnum() function.
print_fileheader
set lRange [an_load_separator_ranges]
print_isalnum sqlite3FtsUnicodeIsalnum $lRange

# Two blank lines separate the generated C functions.
puts "\n"

# The generated tolower() function, built from the case-folding data.
tl_load_casefolding_txt ${casefolding.txt}
print_tolower sqlite3FtsUnicodeTolower

# With -test, also emit the self-test routines and a main() that runs them.
if {$::generate_test_code} {
  print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
  print_tolower_test sqlite3FtsUnicodeTolower
  print_test_main
}

# Close the preprocessor guards opened by print_fileheader.
puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"
|
Reference in New Issue
Block a user