1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-12-24 14:17:58 +03:00

Fix further issues to do with fts5 locale support.

FossilOrigin-Name: e626123580065986f7df50b6140f00048944becce179b9391fbf09f97ac55485
This commit is contained in:
dan
2024-08-12 17:03:37 +00:00
parent 29d5e43ad4
commit 10e54e365a
7 changed files with 109 additions and 66 deletions

View File

@@ -395,10 +395,10 @@ struct Fts5ExtensionApi {
**
** Applications may also register custom tokenizer types. A tokenizer
** is registered by providing fts5 with a populated instance of the
** following structure. Of the three structure methods, xCreate, xDelete and
** xTokenize must be supplied; setting any of these three members of the
** fts5_tokenizer_v2 struct to NULL leads to undefined behaviour. The
** structure methods are expected to function as follows:
** following structure. All structure methods must be defined; setting
** any member of the fts5_tokenizer struct to NULL leads to undefined
** behaviour. The structure methods are expected to function as follows:
**
** xCreate:
** This function is used to allocate and initialize a tokenizer instance.
@@ -481,8 +481,8 @@ struct Fts5ExtensionApi {
** and nLocale. These specify the locale that the tokenizer should use
** for the current request. If pLocale and nLocale are both 0, then the
** tokenizer should use its default locale. Otherwise, pLocale points to
** a buffer containing the name of the locale to use as utf-8 text. nLocale
** contains the number of bytes in pLocale. pLocale is not nul-terminated.
** an nLocale byte buffer containing the name of the locale to use as utf-8
** text. pLocale is not nul-terminated.
**
** SYNONYM SUPPORT
**
@@ -658,7 +658,7 @@ struct fts5_tokenizer {
*/
typedef struct fts5_api fts5_api;
struct fts5_api {
int iVersion; /* Currently 3, was once 2 */
int iVersion; /* Currently always set to 3 */
/* Create a new tokenizer */
int (*xCreateTokenizer)(

View File

@@ -168,7 +168,7 @@ struct Fts5TokenizerConfig {
int nArg;
int ePattern; /* FTS_PATTERN_XXX constant */
const char *pLocale; /* Current locale to use */
int nLocale;
int nLocale; /* Size of pLocale in bytes */
};
/*

View File

@@ -257,8 +257,8 @@ static void fts5HighlightFunction(
sqlite3_result_text(pCtx, "", -1, SQLITE_STATIC);
rc = SQLITE_OK;
}else if( ctx.zIn ){
const char *pLoc = 0;
int nLoc = 0;
const char *pLoc = 0; /* Locale of column iCol */
int nLoc = 0; /* Size of pLoc in bytes */
if( rc==SQLITE_OK ){
rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter);
}
@@ -466,8 +466,8 @@ static void fts5SnippetFunction(
memset(&sFinder, 0, sizeof(Fts5SFinder));
for(i=0; i<nCol; i++){
if( iCol<0 || iCol==i ){
const char *pLoc = 0;
int nLoc = 0;
const char *pLoc = 0; /* Locale of column i */
int nLoc = 0; /* Size of pLoc in bytes */
int nDoc;
int nDocsize;
int ii;
@@ -536,7 +536,7 @@ static void fts5SnippetFunction(
rc = pApi->xColumnSize(pFts, iBestCol, &nColSize);
}
if( ctx.zIn ){
const char *pLoc = 0; /* Locale to tokenize in */
const char *pLoc = 0; /* Locale of column iBestCol */
int nLoc = 0; /* Bytes in pLoc */
if( rc==SQLITE_OK ){

View File

@@ -3092,8 +3092,8 @@ int sqlite3Fts5ExprPopulatePoslists(
}
}
return sqlite3Fts5Tokenize(pConfig, FTS5_TOKENIZE_DOCUMENT, z, n,
(void*)&sCtx, fts5ExprPopulatePoslistsCb
return sqlite3Fts5Tokenize(pConfig,
FTS5_TOKENIZE_DOCUMENT, z, n, (void*)&sCtx, fts5ExprPopulatePoslistsCb
);
}

View File

@@ -115,13 +115,16 @@ struct Fts5Auxiliary {
** Of course, if bV2Native is false, then x1 contains the real routines and
** x2 the synthesized ones. In this case a pointer to the Fts5TokenizerModule
** object should be passed to x2.xCreate.
**
** The synthesized wrapper routines are necessary for xFindTokenizer(_v2)
** calls.
*/
struct Fts5TokenizerModule {
char *zName; /* Name of tokenizer */
void *pUserData; /* User pointer passed to xCreate() */
int bV2Native; /* True if v2 native tokenizer */
fts5_tokenizer x1; /* Tokenizer functions */
fts5_tokenizer_v2 x2; /* Tokenizer functions */
fts5_tokenizer_v2 x2; /* V2 tokenizer functions */
void (*xDestroy)(void*); /* Destructor function */
Fts5TokenizerModule *pNext; /* Next registered tokenizer module */
};
@@ -179,12 +182,6 @@ struct Fts5Sorter {
** If the cursor iterates in descending order of rowid, iFirstRowid
** is the upper limit (i.e. the "first" rowid visited) and iLastRowid
** the lower.
**
** pLocale, nLocale:
** These are set by API method xTokenizeSetLocale(). xTokenizeSetLocale()
** does not actually configure the tokenizer, it just stores the values
** it is passed in these variables. The fts5_tokenizer_v2.xSetLocale()
** method is called from within the xTokenize() API method if required.
*/
struct Fts5Cursor {
sqlite3_vtab_cursor base; /* Base class used by SQLite core */
@@ -251,7 +248,7 @@ struct Fts5Cursor {
#define BitFlagTest(x,y) (((x) & (y))!=0)
/*
** The subtype with which values returned by fts5_locale() are tagged.
** The subtype value and header bytes used by fts5_locale().
*/
#define FTS5_LOCALE_SUBTYPE ((unsigned int)'L')
#define FTS5_LOCALE_HEADER "\x00\xE0\xB2\xEB"
@@ -1255,24 +1252,24 @@ static void fts5SetVtabError(Fts5FullTable *p, const char *zFormat, ...){
}
/*
** Configure the tokenizer to use the locale specified by nLocale byte
** buffer zLocale. Return SQLITE_OK if successful, or an SQLite error
** code otherwise.
** Arrange for subsequent calls to sqlite3Fts5Tokenize() to use the locale
** specified by zLocale/nLocale. The buffer indicated by zLocale must remain
** valid until after the final call to sqlite3Fts5Tokenize() that will use
** the locale.
*/
static int fts5SetLocale(
static void fts5SetLocale(
Fts5Config *pConfig,
const char *zLocale,
int nLocale
){
int rc = SQLITE_OK;
Fts5TokenizerConfig *pT = &pConfig->t;
pT->pLocale = zLocale;
pT->nLocale = nLocale;
return rc;
}
/*
** Reset the locale of the tokenizer to its default.
** Clear any locale configured by an earlier call to fts5SetLocale() or
** sqlite3Fts5ExtractText().
*/
void sqlite3Fts5ClearLocale(Fts5Config *pConfig){
fts5SetLocale(pConfig, 0, 0);
@@ -1293,9 +1290,20 @@ void sqlite3Fts5ClearLocale(Fts5Config *pConfig){
** 1) Ordinary values. The text can be extracted from these using
** sqlite3_value_text().
**
** 2) Blobs tagged with sub-type FTS5_LOCALE_SUBTYPE, or those read from
** the content table of a normal content or external-content table
** with locale=1 set.
** 2) Combination text/locale blobs created by fts5_locale(). There
** are several cases for these:
**
** * Blobs tagged with FTS5_LOCALE_SUBTYPE.
** * Blobs read from the content table of a locale=1 external-content
** table, and
** * Blobs read from the content table of a locale=1 regular
** content table.
**
** The first two cases above should have the 4 byte FTS5_LOCALE_HEADER
** header. It is an error if a blob with the subtype or a blob read
** from the content table of an external content table does not have
** the required header. A blob read from the content table of a regular
** locale=1 table does not have the header. This is to save space.
**
** If successful, SQLITE_OK is returned and output parameters (*ppText)
** and (*pnText) are set to point to a buffer containing the extracted utf-8
@@ -1306,11 +1314,11 @@ void sqlite3Fts5ClearLocale(Fts5Config *pConfig){
** Parameter bContent must be true if the value was read from an indexed
** column (i.e. not UNINDEXED) of the on disk content.
**
** If pbResetTokenizer is not NULL and if case (2) is used, then the
** tokenizer is configured to use the locale. In this case (*pbResetTokenizer)
** is set to true before returning, to indicate that the caller must
** call sqlite3Fts5ClearLocale() to reset the tokenizer after tokenizing
** the text.
** If pbResetTokenizer is not NULL and if case (2) is used, then
** fts5SetLocale() is called to ensure subsequent sqlite3Fts5Tokenize() calls
** use the locale. In this case (*pbResetTokenizer) is set to true before
** returning, to indicate that the caller must call sqlite3Fts5ClearLocale()
** to clear the locale after tokenizing the text.
*/
int sqlite3Fts5ExtractText(
Fts5Config *pConfig,
@@ -1367,7 +1375,7 @@ int sqlite3Fts5ExtractText(
nText = nBlob-nLocale-1;
if( pbResetTokenizer ){
rc = fts5SetLocale(pConfig, (const char*)pBlob, nLocale);
fts5SetLocale(pConfig, (const char*)pBlob, nLocale);
*pbResetTokenizer = 1;
}
}
@@ -1389,18 +1397,18 @@ int sqlite3Fts5ExtractText(
** the text of the expression, and sets output variable (*pzText) to
** point to a nul-terminated buffer containing the expression.
**
** If pVal was an fts5_locale() value, then the tokenizer has been
** configured to use the required locale.
** If pVal was an fts5_locale() value, then fts5SetLocale() is called to
** set the tokenizer to use the specified locale.
**
** If output variable (*pbFreeAndReset) is set to true, then the caller
** is required to (a) call sqlite3Fts5ClearLocale() to reset the tokenizer
** locale, and (b) call sqlite3_free() to free (*pzText).
*/
static int fts5ExtractExprText(
Fts5FullTable *pTab,
sqlite3_value *pVal,
char **pzText,
int *pbFreeAndReset
Fts5Config *pConfig, /* Fts5 configuration */
sqlite3_value *pVal, /* Value to extract expression text from */
char **pzText, /* OUT: nul-terminated buffer of text */
int *pbFreeAndReset /* OUT: Free (*pzText) and clear locale */
){
const char *zText = 0;
int nText = 0;
@@ -1408,12 +1416,12 @@ static int fts5ExtractExprText(
int bReset = 0;
*pbFreeAndReset = 0;
rc = sqlite3Fts5ExtractText(pTab->p.pConfig, pVal, 0, &bReset, &zText,&nText);
rc = sqlite3Fts5ExtractText(pConfig, pVal, 0, &bReset, &zText, &nText);
if( rc==SQLITE_OK ){
if( bReset ){
*pzText = sqlite3Fts5Mprintf(&rc, "%.*s", nText, zText);
if( rc!=SQLITE_OK ){
sqlite3Fts5ClearLocale(pTab->p.pConfig);
sqlite3Fts5ClearLocale(pConfig);
}else{
*pbFreeAndReset = 1;
}
@@ -1494,7 +1502,7 @@ static int fts5FilterMethod(
int bFreeAndReset = 0;
int bInternal = 0;
rc = fts5ExtractExprText(pTab, apVal[i], &zText, &bFreeAndReset);
rc = fts5ExtractExprText(pConfig, apVal[i], &zText, &bFreeAndReset);
if( rc!=SQLITE_OK ) goto filter_out;
if( zText==0 ) zText = "";
@@ -2124,6 +2132,9 @@ static int fts5ApiRowCount(Fts5Context *pCtx, i64 *pnRow){
return sqlite3Fts5StorageRowCount(pTab->pStorage, pnRow);
}
/*
** Implementation of xTokenize_v2() API.
*/
static int fts5ApiTokenize_v2(
Fts5Context *pCtx,
const char *pText, int nText,
@@ -2143,6 +2154,11 @@ static int fts5ApiTokenize_v2(
return rc;
}
/*
** Implementation of xTokenize() API. This is just xTokenize_v2() with NULL/0
** passed as the locale.
*/
static int fts5ApiTokenize(
Fts5Context *pCtx,
const char *pText, int nText,
@@ -2190,11 +2206,18 @@ static int fts5ApiColumnText(
return rc;
}
/*
** This is called by various API functions - xInst, xPhraseFirst,
** xPhraseFirstColumn etc. - to obtain the position list for phrase iPhrase
** of the current row. This function works both for detail=full tables (in
** which case the position-list was read from the fts index) and for other
** detail= modes if the row content is available.
*/
static int fts5CsrPoslist(
Fts5Cursor *pCsr,
int iPhrase,
const u8 **pa,
int *pn
Fts5Cursor *pCsr, /* Fts5 cursor object */
int iPhrase, /* Phrase to find position list for */
const u8 **pa, /* OUT: Pointer to position list buffer */
int *pn /* OUT: Size of (*pa) in bytes */
){
Fts5Config *pConfig = ((Fts5Table*)(pCsr->base.pVtab))->pConfig;
int rc = SQLITE_OK;
@@ -2240,7 +2263,6 @@ static int fts5CsrPoslist(
*pn = 0;
}
return rc;
}
@@ -2808,6 +2830,11 @@ static Fts5Cursor *fts5CursorFromCsrid(Fts5Global *pGlobal, i64 iCsrId){
return pCsr;
}
/*
** Parameter zFmt is a printf() style formatting string. This function
** formats it using the trailing arguments and returns the result as
** an error message to the context passed as the first argument.
*/
static void fts5ResultError(sqlite3_context *pCtx, const char *zFmt, ...){
char *zErr = 0;
va_list ap;
@@ -2931,12 +2958,13 @@ static int fts5PoslistBlob(sqlite3_context *pCtx, Fts5Cursor *pCsr){
/*
** Value pVal was read from column iCol of the FTS5 table. This function
** returns it to the owner of pCtx via a call to an sqlite3_result_xxx()
** function. This function deals with the same 3 cases as
** function. This function deals with the same cases as
** sqlite3Fts5ExtractText():
**
** 1) Ordinary values. These can be returned using sqlite3_result_value().
**
** 2) Blobs from fts5_locale().
** 2) Blobs from fts5_locale(). The text is extracted from these and
** returned via sqlite3_result_text(). The locale is discarded.
*/
static void fts5ExtractValueFromColumn(
sqlite3_context *pCtx,
@@ -3176,6 +3204,21 @@ static int fts5CreateAux(
return rc;
}
/*
** This function is used by xCreateTokenizer_v2() and xCreateTokenizer().
** It allocates and partially populates a new Fts5TokenizerModule object.
** The new object is already linked into the Fts5Global context before
** returning.
**
** If successful, SQLITE_OK is returned and a pointer to the new
** Fts5TokenizerModule object is returned via output parameter (*ppNew). All
** that is required is for the caller to fill in the methods in
** Fts5TokenizerModule.x1 and x2, and to set Fts5TokenizerModule.bV2Native
** as appropriate.
**
** If an error occurs, an SQLite error code is returned and the final value
** of (*ppNew) is undefined.
*/
static int fts5NewTokenizerModule(
Fts5Global *pGlobal, /* Global context (one per db handle) */
const char *zName, /* Name of new tokenizer */

View File

@@ -1,5 +1,5 @@
C Update\sthe\sporter\stokenizer\sto\suse\slocales.
D 2024-08-12T11:46:09.154
C Fix\sfurther\sissues\sto\sdo\swith\sfts5\slocale\ssupport.
D 2024-08-12T17:03:37.726
F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1
F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea
F LICENSE.md df5091916dbb40e6e9686186587125e1b2ff51f022cc334e886c19a0e9982724
@@ -92,15 +92,15 @@ F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
F ext/fts3/unicode/mkunicode.tcl d5aebf022fa4577ee8cdf27468f0d847879993959101f6dbd6348ef0cfc324a7
F ext/fts3/unicode/parseunicode.tcl a981bd6466d12dd17967515801c3ff23f74a281be1a03cf1e6f52a6959fc77eb
F ext/fts5/extract_api_docs.tcl bc3a0ca78be7d3df08e7602c00ca48021ebae40682d75eb001bfdf6e54ffb44e
F ext/fts5/fts5.h 7f1197009fc0e9822a8a584aa1f90591bdbf04f4503ecfe06949f3afe7a1fe06
F ext/fts5/fts5Int.h b40bb0bd54aaa4ac4712b6c5763b2167764614aaef204dbae81638b4548bca5d
F ext/fts5/fts5_aux.c 0d0ee62dfebe93ccf6b293edb0b21ebe5c8bdc85e962a001745f2d13ea3e79d2
F ext/fts5/fts5.h 4c6998c6186268b4dbe9baef2c0d2ab974bd90996d61d4dbe801367249be6de4
F ext/fts5/fts5Int.h 776b21159eef8d30379e5bc4627eae9e841d36e43f19dc8908c786e62aaf9e38
F ext/fts5/fts5_aux.c 12cd2512f869217c38b70c31de5b5f741812734fafa80f55b32ea9bbd96e2152
F ext/fts5/fts5_buffer.c 0eec58bff585f1a44ea9147eae5da2447292080ea435957f7488c70673cb6f09
F ext/fts5/fts5_config.c 187f7ffa5eddd6539ffa592de85e95b18be951728491390121bb215549a24a2a
F ext/fts5/fts5_expr.c ee1949c5c20901cbaca0885902f1d0c136679262dee71b457a34a92e1d16ddac
F ext/fts5/fts5_expr.c 3a24c6ab5b7545312a5ec03085ae705ede820a08f9a63f1d72829ed4a35da6f6
F ext/fts5/fts5_hash.c adda4272be401566a6e0ba1acbe70ee5cb97fce944bc2e04dc707152a0ec91b1
F ext/fts5/fts5_index.c eb9a0dda3bc6ef969a6be8d2746af56856e67251810ddba08622b45be8477abe
F ext/fts5/fts5_main.c cd61abbfd02f0f22e3c124ae2ad10c2a51cdf8acf38177410d44e134c1d1364b
F ext/fts5/fts5_main.c 4fe8349b812a9fde8e44ac5568f19d713ccc4790eb3ecb692f6551729c481b2b
F ext/fts5/fts5_storage.c 5bf88213ff5911625c142ac332ddba10dcd0869e757f91f2a3d27f27ba595992
F ext/fts5/fts5_tcl.c 50c7e16753fde0c4d80d8abd00a4ed2b0e998d5d3899a484510d01923c5da43b
F ext/fts5/fts5_test_mi.c 08c11ec968148d4cb4119d96d819f8c1f329812c568bac3684f5464be177d3ee
@@ -2207,8 +2207,8 @@ F vsixtest/vsixtest.tcl 6195aba1f12a5e10efc2b8c0009532167be5e301abe5b31385638080
F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc
F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e
F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0
P f7d56a1f2149f0da117167db62e2c28ec337e8da3403873b64cdfc6a951e2e8e
R 7151af5ed6816182b47b60322cc8dcba
P 3291ce3a3359a80e51e4546a3d7a187cbe4c7530fca6632f0bb2728525efe212
R 5bdde041363e74c68796cadade4d8480
U dan
Z a17240af0068f64d6da9a8176108962c
Z 098e7ed7a851f6658bf54618988e8ebd
# Remove this line to create a well-formed Fossil manifest.

View File

@@ -1 +1 @@
3291ce3a3359a80e51e4546a3d7a187cbe4c7530fca6632f0bb2728525efe212
e626123580065986f7df50b6140f00048944becce179b9391fbf09f97ac55485