Add the xLanguageid method to sqlite3_fts3_tokenizer versions 1 and greater.
FossilOrigin-Name: f8e9c445dd358c40e5a7bf3756b9f291909dbea7

@@ -13,6 +13,9 @@
** This file is not part of the production FTS code. It is only used for
** testing. It contains a Tcl command that can be used to test if a document
** matches an FTS NEAR expression.
**
** As of March 2012, it also contains a version 1 tokenizer used for testing
** that the sqlite3_tokenizer_module.xLanguageid() method is invoked correctly.
*/

#include <tcl.h>
@@ -314,11 +317,207 @@ static int fts3_configure_incr_load_cmd(
  return TCL_OK;
}

/**************************************************************************
** Beginning of test tokenizer code.
**
** For language 0, this tokenizer is similar to the default 'simple'
** tokenizer. For other language ids L, the following rules apply:
**
**   * Odd numbered languages are case-sensitive. Even numbered
**     languages are not.
**
**   * Language ids 100 or greater are considered an error.
**
** The implementation assumes that the input contains only ASCII characters
** (i.e. those that may be encoded in UTF-8 using a single byte).
*/
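/* A worked example of the rules above. Tokenizing the input "The Quick fox"
** produces:
**
**   iLangid==0 (even):  "the" "quick" "fox"   (folded to lower case)
**   iLangid==1 (odd):   "The" "Quick" "fox"   (case preserved)
**   iLangid>=100:       the xLanguageid() call fails with SQLITE_ERROR
*/
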
typedef struct test_tokenizer {
  sqlite3_tokenizer base;
} test_tokenizer;

typedef struct test_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *aInput;      /* Input being tokenized */
  int nInput;              /* Size of the input in bytes */
  int iInput;              /* Current offset in aInput */
  int iToken;              /* Index of next token to be returned */
  char *aBuffer;           /* Buffer containing current token */
  int nBuffer;             /* Number of bytes allocated at aBuffer */
  int iLangid;             /* Configured language id */
} test_tokenizer_cursor;

static int testTokenizerCreate(
  int argc, const char * const *argv,
  sqlite3_tokenizer **ppTokenizer
){
  test_tokenizer *pNew;
  UNUSED_PARAMETER(argc);
  UNUSED_PARAMETER(argv);

  pNew = sqlite3_malloc(sizeof(test_tokenizer));
  if( !pNew ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(test_tokenizer));

  *ppTokenizer = (sqlite3_tokenizer *)pNew;
  return SQLITE_OK;
}

static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){
  test_tokenizer *p = (test_tokenizer *)pTokenizer;
  sqlite3_free(p);
  return SQLITE_OK;
}

static int testTokenizerOpen(
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  const char *pInput, int nBytes,        /* String to be tokenized */
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
){
  int rc = SQLITE_OK;                    /* Return code */
  test_tokenizer_cursor *pCsr;           /* New cursor object */

  UNUSED_PARAMETER(pTokenizer);

  pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor));
  if( pCsr==0 ){
    rc = SQLITE_NOMEM;
  }else{
    memset(pCsr, 0, sizeof(test_tokenizer_cursor));
    pCsr->aInput = pInput;
    if( nBytes<0 ){
      pCsr->nInput = (int)strlen(pInput);
    }else{
      pCsr->nInput = nBytes;
    }
  }

  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  return rc;
}

static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){
  test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
  sqlite3_free(pCsr->aBuffer);
  sqlite3_free(pCsr);
  return SQLITE_OK;
}

static int testIsTokenChar(char c){
  return (c>='a' && c<='z') || (c>='A' && c<='Z');
}
static int testTolower(char c){
  char ret = c;
  if( ret>='A' && ret<='Z' ) ret = ret - ('A'-'a');
  return ret;
}

static int testTokenizerNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by testTokenizerOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
  int rc = SQLITE_OK;
  const char *p;
  const char *pEnd;

  p = &pCsr->aInput[pCsr->iInput];
  pEnd = &pCsr->aInput[pCsr->nInput];

  /* Skip past any delimiter characters (anything other than a letter) */
  assert( p<=pEnd );
  while( p<pEnd && testIsTokenChar(*p)==0 ) p++;

  if( p==pEnd ){
    rc = SQLITE_DONE;
  }else{
    /* Advance to the end of the token */
    const char *pToken = p;
    int nToken;
    while( p<pEnd && testIsTokenChar(*p) ) p++;
    nToken = (int)(p-pToken);

    /* Copy the token into the buffer */
    if( nToken>pCsr->nBuffer ){
      sqlite3_free(pCsr->aBuffer);
      pCsr->aBuffer = sqlite3_malloc(nToken);
      pCsr->nBuffer = (pCsr->aBuffer ? nToken : 0);  /* Track allocation size */
    }
    if( pCsr->aBuffer==0 ){
      rc = SQLITE_NOMEM;
    }else{
      int i;

      if( pCsr->iLangid & 0x00000001 ){
        /* Odd language ids are case-sensitive: copy the token verbatim */
        for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i];
      }else{
        /* Even language ids fold the token to lower case */
        for(i=0; i<nToken; i++) pCsr->aBuffer[i] = testTolower(pToken[i]);
      }
      pCsr->iToken++;
      pCsr->iInput = (int)(p - pCsr->aInput);

      *ppToken = pCsr->aBuffer;
      *pnBytes = nToken;
      *piStartOffset = (int)(pToken - pCsr->aInput);
      *piEndOffset = (int)(p - pCsr->aInput);
      *piPosition = pCsr->iToken;
    }
  }

  return rc;
}

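/* An illustrative trace of the function above. For the 7-byte input
** "one two" and an even language id, successive calls return:
**
**   call 1: *ppToken="one", *pnBytes=3, *piStartOffset=0, *piEndOffset=3,
**           *piPosition=1
**   call 2: *ppToken="two", *pnBytes=3, *piStartOffset=4, *piEndOffset=7,
**           *piPosition=2
**   call 3: SQLITE_DONE
*/
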
static int testTokenizerLanguage(
  sqlite3_tokenizer_cursor *pCursor,
  int iLangid
){
  int rc = SQLITE_OK;
  test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
  pCsr->iLangid = iLangid;
  if( pCsr->iLangid>=100 ){
    rc = SQLITE_ERROR;
  }
  return rc;
}

static int fts3_test_tokenizer_cmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  static const sqlite3_tokenizer_module testTokenizerModule = {
    1,                            /* iVersion: 1 includes xLanguageid */
    testTokenizerCreate,
    testTokenizerDestroy,
    testTokenizerOpen,
    testTokenizerClose,
    testTokenizerNext,
    testTokenizerLanguage
  };
  const sqlite3_tokenizer_module *pPtr = &testTokenizerModule;
  UNUSED_PARAMETER(clientData);
  if( objc!=1 ){
    Tcl_WrongNumArgs(interp, 1, objv, "");
    return TCL_ERROR;
  }
  Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(
      (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *)
  ));
  return TCL_OK;
}
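
/* The command above returns the address of testTokenizerModule encoded as a
** Tcl byte-array. A test script is expected to pass that value to the
** fts3_tokenizer() scalar SQL function to register the module. A minimal
** sketch, assuming a script-side pointer variable $ptr and the illustrative
** tokenizer name "testtokenizer":
**
**   set ptr [fts3_test_tokenizer]
**   execsql { SELECT fts3_tokenizer('testtokenizer', $ptr) }
**   execsql { CREATE VIRTUAL TABLE t1 USING fts4(tokenize=testtokenizer) }
*/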

/*
** End of tokenizer code.
**************************************************************************/

int Sqlitetestfts3_Init(Tcl_Interp *interp){
  Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0);
  Tcl_CreateObjCommand(interp,
    "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0
  );
  Tcl_CreateObjCommand(
      interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0
  );
  return TCL_OK;
}
#endif /* ifdef SQLITE_TEST */
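
The tokenizer above can also be exercised end-to-end without going through
SQL. A minimal sketch of the call sequence the FTS3 core makes against this
module (the function name exampleUsage() is hypothetical and error handling
is omitted):

  static void exampleUsage(void){
    sqlite3_tokenizer *pTok = 0;
    sqlite3_tokenizer_cursor *pCsr = 0;
    const char *zToken; int nToken, iStart, iEnd, iPos;

    testTokenizerCreate(0, 0, &pTok);
    testTokenizerOpen(pTok, "Second Example", -1, &pCsr);
    pCsr->pTokenizer = pTok;          /* Normally set by the FTS3 core */
    testTokenizerLanguage(pCsr, 2);   /* Even id: tokens folded to lower case */
    while( testTokenizerNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos)
        ==SQLITE_OK ){
      /* Yields "second" (offsets 0..6), then "example" (offsets 7..14) */
    }
    testTokenizerClose(pCsr);
    testTokenizerDestroy(pTok);
  }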