mirror of
https://github.com/sqlite/sqlite.git
synced 2025-12-24 14:17:58 +03:00
A first implementation of a full-text search module for SQLite. (CVS 3363)
FossilOrigin-Name: b0d8e0d314d6f77b7d4b5dd00c694a1323f7a8e4
This commit is contained in:
404
ext/fts1/ft_hash.c
Normal file
404
ext/fts1/ft_hash.c
Normal file
@@ -0,0 +1,404 @@
|
||||
/*
|
||||
** 2001 September 22
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
*************************************************************************
|
||||
** This is the implementation of generic hash-tables used in SQLite.
|
||||
** We've modified it slightly to serve as a standalone hash table
|
||||
** implementation for the full-text indexing module.
|
||||
*/
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ft_hash.h"
|
||||
|
||||
void *malloc_and_zero(int n){
|
||||
void *p = malloc(n);
|
||||
if( p ){
|
||||
memset(p, 0, n);
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Turn bulk memory into a hash table object by initializing the
|
||||
** fields of the Hash structure.
|
||||
**
|
||||
** "pNew" is a pointer to the hash table that is to be initialized.
|
||||
** keyClass is one of the constants HASH_INT, HASH_POINTER,
|
||||
** HASH_BINARY, or HASH_STRING. The value of keyClass
|
||||
** determines what kind of key the hash table will use. "copyKey" is
|
||||
** true if the hash table should make its own private copy of keys and
|
||||
** false if it should just use the supplied pointer. CopyKey only makes
|
||||
** sense for HASH_STRING and HASH_BINARY and is ignored
|
||||
** for other key classes.
|
||||
*/
|
||||
void HashInit(Hash *pNew, int keyClass, int copyKey){
|
||||
assert( pNew!=0 );
|
||||
assert( keyClass>=HASH_STRING && keyClass<=HASH_BINARY );
|
||||
pNew->keyClass = keyClass;
|
||||
#if 0
|
||||
if( keyClass==HASH_POINTER || keyClass==HASH_INT ) copyKey = 0;
|
||||
#endif
|
||||
pNew->copyKey = copyKey;
|
||||
pNew->first = 0;
|
||||
pNew->count = 0;
|
||||
pNew->htsize = 0;
|
||||
pNew->ht = 0;
|
||||
pNew->xMalloc = malloc_and_zero;
|
||||
pNew->xFree = free;
|
||||
}
|
||||
|
||||
/* Remove all entries from a hash table. Reclaim all memory.
|
||||
** Call this routine to delete a hash table or to reset a hash table
|
||||
** to the empty state.
|
||||
*/
|
||||
void HashClear(Hash *pH){
|
||||
HashElem *elem; /* For looping over all elements of the table */
|
||||
|
||||
assert( pH!=0 );
|
||||
elem = pH->first;
|
||||
pH->first = 0;
|
||||
if( pH->ht ) pH->xFree(pH->ht);
|
||||
pH->ht = 0;
|
||||
pH->htsize = 0;
|
||||
while( elem ){
|
||||
HashElem *next_elem = elem->next;
|
||||
if( pH->copyKey && elem->pKey ){
|
||||
pH->xFree(elem->pKey);
|
||||
}
|
||||
pH->xFree(elem);
|
||||
elem = next_elem;
|
||||
}
|
||||
pH->count = 0;
|
||||
}
|
||||
|
||||
#if 0 /* NOT USED */
|
||||
/*
|
||||
** Hash and comparison functions when the mode is HASH_INT
|
||||
*/
|
||||
static int intHash(const void *pKey, int nKey){
|
||||
return nKey ^ (nKey<<8) ^ (nKey>>8);
|
||||
}
|
||||
static int intCompare(const void *pKey1, int n1, const void *pKey2, int n2){
|
||||
return n2 - n1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0 /* NOT USED */
|
||||
/*
|
||||
** Hash and comparison functions when the mode is HASH_POINTER
|
||||
*/
|
||||
static int ptrHash(const void *pKey, int nKey){
|
||||
uptr x = Addr(pKey);
|
||||
return x ^ (x<<8) ^ (x>>8);
|
||||
}
|
||||
static int ptrCompare(const void *pKey1, int n1, const void *pKey2, int n2){
|
||||
if( pKey1==pKey2 ) return 0;
|
||||
if( pKey1<pKey2 ) return -1;
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
** Hash and comparison functions when the mode is HASH_STRING
|
||||
*/
|
||||
static int strHash(const void *pKey, int nKey){
|
||||
const char *z = (const char *)pKey;
|
||||
int h = 0;
|
||||
if( nKey<=0 ) nKey = (int) strlen(z);
|
||||
while( nKey > 0 ){
|
||||
h = (h<<3) ^ h ^ *z++;
|
||||
nKey--;
|
||||
}
|
||||
return h & 0x7fffffff;
|
||||
}
|
||||
static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
|
||||
if( n1!=n2 ) return 1;
|
||||
return strncmp((const char*)pKey1,(const char*)pKey2,n1);
|
||||
}
|
||||
|
||||
/*
|
||||
** Hash and comparison functions when the mode is HASH_BINARY
|
||||
*/
|
||||
static int binHash(const void *pKey, int nKey){
|
||||
int h = 0;
|
||||
const char *z = (const char *)pKey;
|
||||
while( nKey-- > 0 ){
|
||||
h = (h<<3) ^ h ^ *(z++);
|
||||
}
|
||||
return h & 0x7fffffff;
|
||||
}
|
||||
static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
|
||||
if( n1!=n2 ) return 1;
|
||||
return memcmp(pKey1,pKey2,n1);
|
||||
}
|
||||
|
||||
/*
|
||||
** Return a pointer to the appropriate hash function given the key class.
|
||||
**
|
||||
** The C syntax in this function definition may be unfamilar to some
|
||||
** programmers, so we provide the following additional explanation:
|
||||
**
|
||||
** The name of the function is "hashFunction". The function takes a
|
||||
** single parameter "keyClass". The return value of hashFunction()
|
||||
** is a pointer to another function. Specifically, the return value
|
||||
** of hashFunction() is a pointer to a function that takes two parameters
|
||||
** with types "const void*" and "int" and returns an "int".
|
||||
*/
|
||||
static int (*hashFunction(int keyClass))(const void*,int){
|
||||
#if 0 /* HASH_INT and HASH_POINTER are never used */
|
||||
switch( keyClass ){
|
||||
case HASH_INT: return &intHash;
|
||||
case HASH_POINTER: return &ptrHash;
|
||||
case HASH_STRING: return &strHash;
|
||||
case HASH_BINARY: return &binHash;;
|
||||
default: break;
|
||||
}
|
||||
return 0;
|
||||
#else
|
||||
if( keyClass==HASH_STRING ){
|
||||
return &strHash;
|
||||
}else{
|
||||
assert( keyClass==HASH_BINARY );
|
||||
return &binHash;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
** Return a pointer to the appropriate hash function given the key class.
|
||||
**
|
||||
** For help in interpreted the obscure C code in the function definition,
|
||||
** see the header comment on the previous function.
|
||||
*/
|
||||
static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
|
||||
#if 0 /* HASH_INT and HASH_POINTER are never used */
|
||||
switch( keyClass ){
|
||||
case HASH_INT: return &intCompare;
|
||||
case HASH_POINTER: return &ptrCompare;
|
||||
case HASH_STRING: return &strCompare;
|
||||
case HASH_BINARY: return &binCompare;
|
||||
default: break;
|
||||
}
|
||||
return 0;
|
||||
#else
|
||||
if( keyClass==HASH_STRING ){
|
||||
return &strCompare;
|
||||
}else{
|
||||
assert( keyClass==HASH_BINARY );
|
||||
return &binCompare;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Link an element into the hash table
|
||||
*/
|
||||
static void insertElement(
|
||||
Hash *pH, /* The complete hash table */
|
||||
struct _ht *pEntry, /* The entry into which pNew is inserted */
|
||||
HashElem *pNew /* The element to be inserted */
|
||||
){
|
||||
HashElem *pHead; /* First element already in pEntry */
|
||||
pHead = pEntry->chain;
|
||||
if( pHead ){
|
||||
pNew->next = pHead;
|
||||
pNew->prev = pHead->prev;
|
||||
if( pHead->prev ){ pHead->prev->next = pNew; }
|
||||
else { pH->first = pNew; }
|
||||
pHead->prev = pNew;
|
||||
}else{
|
||||
pNew->next = pH->first;
|
||||
if( pH->first ){ pH->first->prev = pNew; }
|
||||
pNew->prev = 0;
|
||||
pH->first = pNew;
|
||||
}
|
||||
pEntry->count++;
|
||||
pEntry->chain = pNew;
|
||||
}
|
||||
|
||||
|
||||
/* Resize the hash table so that it cantains "new_size" buckets.
|
||||
** "new_size" must be a power of 2. The hash table might fail
|
||||
** to resize if sqliteMalloc() fails.
|
||||
*/
|
||||
static void rehash(Hash *pH, int new_size){
|
||||
struct _ht *new_ht; /* The new hash table */
|
||||
HashElem *elem, *next_elem; /* For looping over existing elements */
|
||||
int (*xHash)(const void*,int); /* The hash function */
|
||||
|
||||
assert( (new_size & (new_size-1))==0 );
|
||||
new_ht = (struct _ht *)pH->xMalloc( new_size*sizeof(struct _ht) );
|
||||
if( new_ht==0 ) return;
|
||||
if( pH->ht ) pH->xFree(pH->ht);
|
||||
pH->ht = new_ht;
|
||||
pH->htsize = new_size;
|
||||
xHash = hashFunction(pH->keyClass);
|
||||
for(elem=pH->first, pH->first=0; elem; elem = next_elem){
|
||||
int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
|
||||
next_elem = elem->next;
|
||||
insertElement(pH, &new_ht[h], elem);
|
||||
}
|
||||
}
|
||||
|
||||
/* This function (for internal use only) locates an element in an
|
||||
** hash table that matches the given key. The hash for this key has
|
||||
** already been computed and is passed as the 4th parameter.
|
||||
*/
|
||||
static HashElem *findElementGivenHash(
|
||||
const Hash *pH, /* The pH to be searched */
|
||||
const void *pKey, /* The key we are searching for */
|
||||
int nKey,
|
||||
int h /* The hash for this key. */
|
||||
){
|
||||
HashElem *elem; /* Used to loop thru the element list */
|
||||
int count; /* Number of elements left to test */
|
||||
int (*xCompare)(const void*,int,const void*,int); /* comparison function */
|
||||
|
||||
if( pH->ht ){
|
||||
struct _ht *pEntry = &pH->ht[h];
|
||||
elem = pEntry->chain;
|
||||
count = pEntry->count;
|
||||
xCompare = compareFunction(pH->keyClass);
|
||||
while( count-- && elem ){
|
||||
if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
|
||||
return elem;
|
||||
}
|
||||
elem = elem->next;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Remove a single entry from the hash table given a pointer to that
|
||||
** element and a hash on the element's key.
|
||||
*/
|
||||
static void removeElementGivenHash(
|
||||
Hash *pH, /* The pH containing "elem" */
|
||||
HashElem* elem, /* The element to be removed from the pH */
|
||||
int h /* Hash value for the element */
|
||||
){
|
||||
struct _ht *pEntry;
|
||||
if( elem->prev ){
|
||||
elem->prev->next = elem->next;
|
||||
}else{
|
||||
pH->first = elem->next;
|
||||
}
|
||||
if( elem->next ){
|
||||
elem->next->prev = elem->prev;
|
||||
}
|
||||
pEntry = &pH->ht[h];
|
||||
if( pEntry->chain==elem ){
|
||||
pEntry->chain = elem->next;
|
||||
}
|
||||
pEntry->count--;
|
||||
if( pEntry->count<=0 ){
|
||||
pEntry->chain = 0;
|
||||
}
|
||||
if( pH->copyKey && elem->pKey ){
|
||||
pH->xFree(elem->pKey);
|
||||
}
|
||||
pH->xFree( elem );
|
||||
pH->count--;
|
||||
if( pH->count<=0 ){
|
||||
assert( pH->first==0 );
|
||||
assert( pH->count==0 );
|
||||
HashClear(pH);
|
||||
}
|
||||
}
|
||||
|
||||
/* Attempt to locate an element of the hash table pH with a key
|
||||
** that matches pKey,nKey. Return the data for this element if it is
|
||||
** found, or NULL if there is no match.
|
||||
*/
|
||||
void *HashFind(const Hash *pH, const void *pKey, int nKey){
|
||||
int h; /* A hash on key */
|
||||
HashElem *elem; /* The element that matches key */
|
||||
int (*xHash)(const void*,int); /* The hash function */
|
||||
|
||||
if( pH==0 || pH->ht==0 ) return 0;
|
||||
xHash = hashFunction(pH->keyClass);
|
||||
assert( xHash!=0 );
|
||||
h = (*xHash)(pKey,nKey);
|
||||
assert( (pH->htsize & (pH->htsize-1))==0 );
|
||||
elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1));
|
||||
return elem ? elem->data : 0;
|
||||
}
|
||||
|
||||
/* Insert an element into the hash table pH. The key is pKey,nKey
|
||||
** and the data is "data".
|
||||
**
|
||||
** If no element exists with a matching key, then a new
|
||||
** element is created. A copy of the key is made if the copyKey
|
||||
** flag is set. NULL is returned.
|
||||
**
|
||||
** If another element already exists with the same key, then the
|
||||
** new data replaces the old data and the old data is returned.
|
||||
** The key is not copied in this instance. If a malloc fails, then
|
||||
** the new data is returned and the hash table is unchanged.
|
||||
**
|
||||
** If the "data" parameter to this function is NULL, then the
|
||||
** element corresponding to "key" is removed from the hash table.
|
||||
*/
|
||||
void *HashInsert(Hash *pH, const void *pKey, int nKey, void *data){
|
||||
int hraw; /* Raw hash value of the key */
|
||||
int h; /* the hash of the key modulo hash table size */
|
||||
HashElem *elem; /* Used to loop thru the element list */
|
||||
HashElem *new_elem; /* New element added to the pH */
|
||||
int (*xHash)(const void*,int); /* The hash function */
|
||||
|
||||
assert( pH!=0 );
|
||||
xHash = hashFunction(pH->keyClass);
|
||||
assert( xHash!=0 );
|
||||
hraw = (*xHash)(pKey, nKey);
|
||||
assert( (pH->htsize & (pH->htsize-1))==0 );
|
||||
h = hraw & (pH->htsize-1);
|
||||
elem = findElementGivenHash(pH,pKey,nKey,h);
|
||||
if( elem ){
|
||||
void *old_data = elem->data;
|
||||
if( data==0 ){
|
||||
removeElementGivenHash(pH,elem,h);
|
||||
}else{
|
||||
elem->data = data;
|
||||
}
|
||||
return old_data;
|
||||
}
|
||||
if( data==0 ) return 0;
|
||||
new_elem = (HashElem*)pH->xMalloc( sizeof(HashElem) );
|
||||
if( new_elem==0 ) return data;
|
||||
if( pH->copyKey && pKey!=0 ){
|
||||
new_elem->pKey = pH->xMalloc( nKey );
|
||||
if( new_elem->pKey==0 ){
|
||||
pH->xFree(new_elem);
|
||||
return data;
|
||||
}
|
||||
memcpy((void*)new_elem->pKey, pKey, nKey);
|
||||
}else{
|
||||
new_elem->pKey = (void*)pKey;
|
||||
}
|
||||
new_elem->nKey = nKey;
|
||||
pH->count++;
|
||||
if( pH->htsize==0 ){
|
||||
rehash(pH,8);
|
||||
if( pH->htsize==0 ){
|
||||
pH->count = 0;
|
||||
pH->xFree(new_elem);
|
||||
return data;
|
||||
}
|
||||
}
|
||||
if( pH->count > pH->htsize ){
|
||||
rehash(pH,pH->htsize*2);
|
||||
}
|
||||
assert( pH->htsize>0 );
|
||||
assert( (pH->htsize & (pH->htsize-1))==0 );
|
||||
h = hraw & (pH->htsize-1);
|
||||
insertElement(pH, &pH->ht[h], new_elem);
|
||||
new_elem->data = data;
|
||||
return 0;
|
||||
}
|
||||
111
ext/fts1/ft_hash.h
Normal file
111
ext/fts1/ft_hash.h
Normal file
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
** 2001 September 22
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
*************************************************************************
|
||||
** This is the header file for the generic hash-table implemenation
|
||||
** used in SQLite. We've modified it slightly to serve as a standalone
|
||||
** hash table implementation for the full-text indexing module.
|
||||
**
|
||||
*/
|
||||
#ifndef _HASH_H_
|
||||
#define _HASH_H_
|
||||
|
||||
/* Forward declarations of structures. */
|
||||
typedef struct Hash Hash;
|
||||
typedef struct HashElem HashElem;
|
||||
|
||||
/* A complete hash table is an instance of the following structure.
|
||||
** The internals of this structure are intended to be opaque -- client
|
||||
** code should not attempt to access or modify the fields of this structure
|
||||
** directly. Change this structure only by using the routines below.
|
||||
** However, many of the "procedures" and "functions" for modifying and
|
||||
** accessing this structure are really macros, so we can't really make
|
||||
** this structure opaque.
|
||||
*/
|
||||
struct Hash {
|
||||
char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */
|
||||
char copyKey; /* True if copy of key made on insert */
|
||||
int count; /* Number of entries in this table */
|
||||
HashElem *first; /* The first element of the array */
|
||||
void *(*xMalloc)(int); /* malloc() function to use */
|
||||
void (*xFree)(void *); /* free() function to use */
|
||||
int htsize; /* Number of buckets in the hash table */
|
||||
struct _ht { /* the hash table */
|
||||
int count; /* Number of entries with this hash */
|
||||
HashElem *chain; /* Pointer to first entry with this hash */
|
||||
} *ht;
|
||||
};
|
||||
|
||||
/* Each element in the hash table is an instance of the following
|
||||
** structure. All elements are stored on a single doubly-linked list.
|
||||
**
|
||||
** Again, this structure is intended to be opaque, but it can't really
|
||||
** be opaque because it is used by macros.
|
||||
*/
|
||||
struct HashElem {
|
||||
HashElem *next, *prev; /* Next and previous elements in the table */
|
||||
void *data; /* Data associated with this element */
|
||||
void *pKey; int nKey; /* Key associated with this element */
|
||||
};
|
||||
|
||||
/*
|
||||
** There are 4 different modes of operation for a hash table:
|
||||
**
|
||||
** HASH_INT nKey is used as the key and pKey is ignored.
|
||||
**
|
||||
** HASH_POINTER pKey is used as the key and nKey is ignored.
|
||||
**
|
||||
** HASH_STRING pKey points to a string that is nKey bytes long
|
||||
** (including the null-terminator, if any). Case
|
||||
** is respected in comparisons.
|
||||
**
|
||||
** HASH_BINARY pKey points to binary data nKey bytes long.
|
||||
** memcmp() is used to compare keys.
|
||||
**
|
||||
** A copy of the key is made for HASH_STRING and HASH_BINARY
|
||||
** if the copyKey parameter to HashInit is 1.
|
||||
*/
|
||||
/* #define HASH_INT 1 // NOT USED */
|
||||
/* #define HASH_POINTER 2 // NOT USED */
|
||||
#define HASH_STRING 3
|
||||
#define HASH_BINARY 4
|
||||
|
||||
/*
|
||||
** Access routines. To delete, insert a NULL pointer.
|
||||
*/
|
||||
void HashInit(Hash*, int keytype, int copyKey);
|
||||
void *HashInsert(Hash*, const void *pKey, int nKey, void *pData);
|
||||
void *HashFind(const Hash*, const void *pKey, int nKey);
|
||||
void HashClear(Hash*);
|
||||
|
||||
/*
|
||||
** Macros for looping over all elements of a hash table. The idiom is
|
||||
** like this:
|
||||
**
|
||||
** Hash h;
|
||||
** HashElem *p;
|
||||
** ...
|
||||
** for(p=HashFirst(&h); p; p=HashNext(p)){
|
||||
** SomeStructure *pData = HashData(p);
|
||||
** // do something with pData
|
||||
** }
|
||||
*/
|
||||
#define HashFirst(H) ((H)->first)
|
||||
#define HashNext(E) ((E)->next)
|
||||
#define HashData(E) ((E)->data)
|
||||
#define HashKey(E) ((E)->pKey)
|
||||
#define HashKeysize(E) ((E)->nKey)
|
||||
|
||||
/*
|
||||
** Number of entries in a hash table
|
||||
*/
|
||||
#define HashCount(H) ((H)->count)
|
||||
|
||||
#endif /* _HASH_H_ */
|
||||
1503
ext/fts1/fulltext.c
Normal file
1503
ext/fts1/fulltext.c
Normal file
File diff suppressed because it is too large
Load Diff
11
ext/fts1/fulltext.h
Normal file
11
ext/fts1/fulltext.h
Normal file
@@ -0,0 +1,11 @@
|
||||
#include "sqlite3.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
int fulltext_init(sqlite3 *db);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
170
ext/fts1/simple_tokenizer.c
Normal file
170
ext/fts1/simple_tokenizer.c
Normal file
@@ -0,0 +1,170 @@
|
||||
/*
|
||||
** The author disclaims copyright to this source code.
|
||||
**
|
||||
*************************************************************************
|
||||
** Implementation of the "simple" full-text-search tokenizer.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#if !defined(__APPLE__)
|
||||
#include <malloc.h>
|
||||
#else
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "tokenizer.h"
|
||||
|
||||
/* Duplicate a string; the caller must free() the returned string.
|
||||
* (We don't use strdup() since it's not part of the standard C library and
|
||||
* may not be available everywhere.) */
|
||||
/* TODO(shess) Copied from fulltext.c, consider util.c for such
|
||||
** things. */
|
||||
static char *string_dup(const char *s){
|
||||
char *str = malloc(strlen(s) + 1);
|
||||
strcpy(str, s);
|
||||
return str;
|
||||
}
|
||||
|
||||
typedef struct simple_tokenizer {
|
||||
sqlite3_tokenizer base;
|
||||
const char *zDelim; /* token delimiters */
|
||||
} simple_tokenizer;
|
||||
|
||||
typedef struct simple_tokenizer_cursor {
|
||||
sqlite3_tokenizer_cursor base;
|
||||
const char *pInput; /* input we are tokenizing */
|
||||
int nBytes; /* size of the input */
|
||||
const char *pCurrent; /* current position in pInput */
|
||||
int iToken; /* index of next token to be returned */
|
||||
char *zToken; /* storage for current token */
|
||||
int nTokenBytes; /* actual size of current token */
|
||||
int nTokenAllocated; /* space allocated to zToken buffer */
|
||||
} simple_tokenizer_cursor;
|
||||
|
||||
static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */
|
||||
|
||||
static int simpleCreate(
|
||||
int argc, const char **argv,
|
||||
sqlite3_tokenizer **ppTokenizer
|
||||
){
|
||||
simple_tokenizer *t;
|
||||
|
||||
t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
|
||||
/* TODO(shess) Delimiters need to remain the same from run to run,
|
||||
** else we need to reindex. One solution would be a meta-table to
|
||||
** track such information in the database, then we'd only want this
|
||||
** information on the initial create.
|
||||
*/
|
||||
if( argc>1 ){
|
||||
t->zDelim = string_dup(argv[1]);
|
||||
} else {
|
||||
/* Build a string of non-alphanumeric ASCII characters */
|
||||
char zDelim[128]; /* nul-terminated, so nul not a member */
|
||||
int i, j;
|
||||
for(i=1, j=0; i<0x80; i++){
|
||||
if( !isalnum(i) ){
|
||||
zDelim[j++] = i;
|
||||
}
|
||||
}
|
||||
zDelim[j++] = '\0';
|
||||
assert( j<=sizeof(zDelim) );
|
||||
t->zDelim = string_dup(zDelim);
|
||||
}
|
||||
|
||||
*ppTokenizer = &t->base;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
|
||||
simple_tokenizer *t = (simple_tokenizer *) pTokenizer;
|
||||
|
||||
free((void *) t->zDelim);
|
||||
free(t);
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
static int simpleOpen(
|
||||
sqlite3_tokenizer *pTokenizer,
|
||||
const char *pInput, int nBytes,
|
||||
sqlite3_tokenizer_cursor **ppCursor
|
||||
){
|
||||
simple_tokenizer_cursor *c;
|
||||
|
||||
c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
|
||||
c->pInput = pInput;
|
||||
c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
|
||||
c->pCurrent = c->pInput; /* start tokenizing at the beginning */
|
||||
c->iToken = 0;
|
||||
c->zToken = NULL; /* no space allocated, yet. */
|
||||
c->nTokenBytes = 0;
|
||||
c->nTokenAllocated = 0;
|
||||
|
||||
*ppCursor = &c->base;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
|
||||
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
|
||||
|
||||
if( NULL!=c->zToken ){
|
||||
free(c->zToken);
|
||||
}
|
||||
free(c);
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
static int simpleNext(
|
||||
sqlite3_tokenizer_cursor *pCursor,
|
||||
const char **ppToken, int *pnBytes,
|
||||
int *piStartOffset, int *piEndOffset, int *piPosition
|
||||
){
|
||||
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
|
||||
simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
|
||||
int ii;
|
||||
|
||||
while( c->pCurrent-c->pInput<c->nBytes ){
|
||||
int n = (int) strcspn(c->pCurrent, t->zDelim);
|
||||
if( n>0 ){
|
||||
if( n+1>c->nTokenAllocated ){
|
||||
c->zToken = realloc(c->zToken, n+1);
|
||||
}
|
||||
for(ii=0; ii<n; ii++){
|
||||
c->zToken[ii] = tolower(c->pCurrent[ii]);
|
||||
}
|
||||
c->zToken[n] = '\0';
|
||||
*ppToken = c->zToken;
|
||||
*pnBytes = n;
|
||||
*piStartOffset = (int) (c->pCurrent-c->pInput);
|
||||
*piEndOffset = *piStartOffset+n;
|
||||
*piPosition = c->iToken++;
|
||||
c->pCurrent += n + 1;
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
c->pCurrent += n + 1;
|
||||
/* TODO(shess) could strspn() to skip delimiters en masse. Needs
|
||||
** to happen in two places, though, which is annoying.
|
||||
*/
|
||||
}
|
||||
return SQLITE_DONE;
|
||||
}
|
||||
|
||||
static sqlite3_tokenizer_module simpleTokenizerModule = {
|
||||
0,
|
||||
simpleCreate,
|
||||
simpleDestroy,
|
||||
simpleOpen,
|
||||
simpleClose,
|
||||
simpleNext,
|
||||
};
|
||||
|
||||
void get_simple_tokenizer_module(
|
||||
sqlite3_tokenizer_module **ppModule
|
||||
){
|
||||
*ppModule = &simpleTokenizerModule;
|
||||
}
|
||||
89
ext/fts1/tokenizer.h
Normal file
89
ext/fts1/tokenizer.h
Normal file
@@ -0,0 +1,89 @@
|
||||
/*
|
||||
** 2006 July 10
|
||||
**
|
||||
** The author disclaims copyright to this source code.
|
||||
**
|
||||
*************************************************************************
|
||||
** Defines the interface to tokenizers used by fulltext-search. There
|
||||
** are three basic components:
|
||||
**
|
||||
** sqlite3_tokenizer_module is a singleton defining the tokenizer
|
||||
** interface functions. This is essentially the class structure for
|
||||
** tokenizers.
|
||||
**
|
||||
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
|
||||
** including customization information defined at creation time.
|
||||
**
|
||||
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
|
||||
** tokens from a particular input.
|
||||
*/
|
||||
#ifndef _TOKENIZER_H_
|
||||
#define _TOKENIZER_H_
|
||||
|
||||
/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
|
||||
** If tokenizers are to be allowed to call sqlite3_*() functions, then
|
||||
** we will need a way to register the API consistently.
|
||||
*/
|
||||
#include "sqlite3.h"
|
||||
|
||||
/*
|
||||
** Structures used by the tokenizer interface.
|
||||
*/
|
||||
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
|
||||
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
|
||||
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
|
||||
|
||||
struct sqlite3_tokenizer_module {
|
||||
int iVersion; /* currently 0 */
|
||||
|
||||
/*
|
||||
** Create and destroy a tokenizer. argc/argv are passed down from
|
||||
** the fulltext virtual table creation to allow customization.
|
||||
*/
|
||||
int (*xCreate)(int argc, const char **argv,
|
||||
sqlite3_tokenizer **ppTokenizer);
|
||||
int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
|
||||
|
||||
/*
|
||||
** Tokenize a particular input. Call xOpen() to prepare to
|
||||
** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then
|
||||
** xClose() to free any internal state. The pInput passed to
|
||||
** xOpen() must exist until the cursor is closed. The ppToken
|
||||
** result from xNext() is only valid until the next call to xNext()
|
||||
** or until xClose() is called.
|
||||
*/
|
||||
/* TODO(shess) current implementation requires pInput to be
|
||||
** nul-terminated. This should either be fixed, or pInput/nBytes
|
||||
** should be converted to zInput.
|
||||
*/
|
||||
int (*xOpen)(sqlite3_tokenizer *pTokenizer,
|
||||
const char *pInput, int nBytes,
|
||||
sqlite3_tokenizer_cursor **ppCursor);
|
||||
int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
|
||||
int (*xNext)(sqlite3_tokenizer_cursor *pCursor,
|
||||
const char **ppToken, int *pnBytes,
|
||||
int *piStartOffset, int *piEndOffset, int *piPosition);
|
||||
};
|
||||
|
||||
struct sqlite3_tokenizer {
|
||||
sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
|
||||
/* Tokenizer implementations will typically add additional fields */
|
||||
};
|
||||
|
||||
struct sqlite3_tokenizer_cursor {
|
||||
sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
|
||||
/* Tokenizer implementations will typically add additional fields */
|
||||
};
|
||||
|
||||
/*
|
||||
** Get the module for a tokenizer which generates tokens based on a
|
||||
** set of non-token characters. The default is to break tokens at any
|
||||
** non-alnum character, though the set of delimiters can also be
|
||||
** specified by the first argv argument to xCreate().
|
||||
*/
|
||||
/* TODO(shess) This doesn't belong here. Need some sort of
|
||||
** registration process.
|
||||
*/
|
||||
void get_simple_tokenizer_module(sqlite3_tokenizer_module **ppModule);
|
||||
|
||||
#endif /* _TOKENIZER_H_ */
|
||||
18
manifest
18
manifest
@@ -1,5 +1,5 @@
|
||||
C Add\sthe\snew\sexperimental\ssqlite3_auto_extension()\sAPI.\s(CVS\s3362)
|
||||
D 2006-08-23T20:07:21
|
||||
C A\sfirst\simplementation\sof\sa\sfull-text\ssearch\smodule\sfor\sSQLite.\s(CVS\s3363)
|
||||
D 2006-08-23T23:58:50
|
||||
F Makefile.in 8e7f9ecebab2c6e0f3db20ff129a8f9405ab64f8
|
||||
F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
|
||||
F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
|
||||
@@ -19,6 +19,12 @@ F doc/lemon.html f0f682f50210928c07e562621c3b7e8ab912a538
|
||||
F doc/report1.txt a031aaf37b185e4fa540223cb516d3bccec7eeac
|
||||
F ext/README.txt 913a7bd3f4837ab14d7e063304181787658b14e1
|
||||
F ext/fts1/README.txt 20ac73b006a70bcfd80069bdaf59214b6cf1db5e
|
||||
F ext/fts1/ft_hash.c 3927bd880e65329bdc6f506555b228b28924921b
|
||||
F ext/fts1/ft_hash.h 1a35e654a235c2c662d3ca0dfc3138ad60b8b7d5
|
||||
F ext/fts1/fulltext.c 373ee2d9d79eb46f09e95ca01d88b6ce62f85821
|
||||
F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd
|
||||
F ext/fts1/simple_tokenizer.c 289b7f35bf692e85834a7b599fd0a136e415a9eb
|
||||
F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9
|
||||
F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895
|
||||
F ltmain.sh f6b283068efa69f06eb8aa1fe4bddfdbdeb35826
|
||||
F main.mk 22a0c92f24ffc377c41ac9d7d63ae3cbb813e532
|
||||
@@ -381,7 +387,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9
|
||||
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
|
||||
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
|
||||
F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513
|
||||
P 533154099c9fe1238705eea03aba388dd71dc35e
|
||||
R 6e117482355c04ea4130650a5df55269
|
||||
U drh
|
||||
Z 4fd4520cd0f576a9a4485f55b2ce4752
|
||||
P a85fc877eb8c92bbb79ac9b7fa91fb362f37cdf7
|
||||
R 04b40b5b3a8b4af8436a461b6264fde6
|
||||
U adamd
|
||||
Z e3809b70bf4ea26198337b68886699ce
|
||||
|
||||
@@ -1 +1 @@
|
||||
a85fc877eb8c92bbb79ac9b7fa91fb362f37cdf7
|
||||
b0d8e0d314d6f77b7d4b5dd00c694a1323f7a8e4
|
||||
Reference in New Issue
Block a user