mirror of
https://github.com/MariaDB/server.git
synced 2026-01-06 05:22:24 +03:00
WL#2466 - Fulltext: "always-index" words
Trie and Aho-Corasick code added to distribution. include/Makefile.am: my_trie.h added to noinst_HEADERS. mysys/Makefile.am: trie.c added to libmysys_a_SOURCES.
This commit is contained in:
@@ -28,7 +28,7 @@ noinst_HEADERS = config-win.h config-os2.h config-netware.h \
|
||||
myisam.h myisampack.h myisammrg.h ft_global.h\
|
||||
mysys_err.h my_base.h help_start.h help_end.h \
|
||||
my_nosys.h my_alarm.h queues.h rijndael.h sha1.h \
|
||||
my_aes.h my_tree.h hash.h thr_alarm.h \
|
||||
my_aes.h my_tree.h my_trie.h hash.h thr_alarm.h \
|
||||
thr_lock.h t_ctype.h violite.h md5.h \
|
||||
mysql_version.h.in my_handler.h my_time.h decimal.h
|
||||
|
||||
|
||||
142
include/my_trie.h
Normal file
142
include/my_trie.h
Normal file
@@ -0,0 +1,142 @@
|
||||
/* Copyright (C) 2005 MySQL AB
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
|
||||
|
||||
#ifndef _trie_h
|
||||
#define _trie_h
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct st_trie_node
|
||||
{
|
||||
uint16 leaf; /* Depth from root node if match, 0 else */
|
||||
byte c; /* Label on this edge */
|
||||
struct st_trie_node *next; /* Next label */
|
||||
struct st_trie_node *links; /* Array of edges leaving this node */
|
||||
struct st_trie_node *fail; /* AC failure function */
|
||||
} TRIE_NODE;
|
||||
|
||||
typedef struct st_trie
|
||||
{
|
||||
TRIE_NODE root;
|
||||
MEM_ROOT mem_root;
|
||||
CHARSET_INFO *charset;
|
||||
uint32 nnodes;
|
||||
uint32 nwords;
|
||||
} TRIE;
|
||||
|
||||
typedef struct st_ac_trie_state
|
||||
{
|
||||
TRIE *trie;
|
||||
TRIE_NODE *node;
|
||||
} AC_TRIE_STATE;
|
||||
|
||||
extern TRIE *trie_init (TRIE *trie, CHARSET_INFO *charset);
|
||||
extern void trie_free (TRIE *trie);
|
||||
extern my_bool trie_insert (TRIE *trie, const byte *key, uint keylen);
|
||||
extern my_bool ac_trie_prepare (TRIE *trie);
|
||||
extern void ac_trie_init (TRIE *trie, AC_TRIE_STATE *state);
|
||||
|
||||
|
||||
/* `trie_goto' is internal function and shouldn't be used. */
|
||||
|
||||
static inline TRIE_NODE *trie_goto (TRIE_NODE *root, TRIE_NODE *node, byte c)
|
||||
{
|
||||
TRIE_NODE *next;
|
||||
DBUG_ENTER("trie_goto");
|
||||
for (next= node->links; next; next= next->next)
|
||||
if (next->c == c)
|
||||
DBUG_RETURN(next);
|
||||
if (root == node)
|
||||
DBUG_RETURN(root);
|
||||
DBUG_RETURN(NULL);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
SYNOPSIS
|
||||
int ac_trie_next (AC_TRIE_STATE *state, byte *c);
|
||||
state - valid pointer to `AC_TRIE_STATE'
|
||||
c - character to lookup
|
||||
|
||||
DESCRIPTION
|
||||
Implementation of search using Aho-Corasick automaton.
|
||||
Performs char-by-char search.
|
||||
|
||||
RETURN VALUE
|
||||
`ac_trie_next' returns length of matched word or 0.
|
||||
*/
|
||||
|
||||
static inline int ac_trie_next (AC_TRIE_STATE *state, byte *c)
|
||||
{
|
||||
TRIE_NODE *root, *node;
|
||||
DBUG_ENTER("ac_trie_next");
|
||||
DBUG_ASSERT(state && c);
|
||||
root= &state->trie->root;
|
||||
node= state->node;
|
||||
while (! (state->node= trie_goto(root, node, *c)))
|
||||
node= node->fail;
|
||||
DBUG_RETURN(state->node->leaf);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
SYNOPSIS
|
||||
my_bool trie_search (TRIE *trie, const byte *key, uint keylen);
|
||||
trie - valid pointer to `TRIE'
|
||||
key - valid pointer to key to insert
|
||||
keylen - non-0 key length
|
||||
|
||||
DESCRIPTION
|
||||
Performs key lookup in trie.
|
||||
|
||||
RETURN VALUE
|
||||
`trie_search' returns `true' if key is in `trie'. Otherwise,
|
||||
`false' is returned.
|
||||
|
||||
NOTES
|
||||
Consecutive search here is "best by test". arrays are very short, so
|
||||
binary search or hashing would add too much complexity that would
|
||||
overweight speed gain. Especially because compiler can optimize simple
|
||||
consecutive loop better (tested)
|
||||
*/
|
||||
|
||||
static inline my_bool trie_search (TRIE *trie, const byte *key, uint keylen)
|
||||
{
|
||||
TRIE_NODE *node;
|
||||
uint k;
|
||||
DBUG_ENTER("trie_search");
|
||||
DBUG_ASSERT(trie && key && keylen);
|
||||
node= &trie->root;
|
||||
|
||||
for (k= 0; k < keylen; k++)
|
||||
{
|
||||
byte p;
|
||||
if (! (node= node->links))
|
||||
DBUG_RETURN(FALSE);
|
||||
p= key[k];
|
||||
while (p != node->c)
|
||||
if (! (node= node->next))
|
||||
DBUG_RETURN(FALSE);
|
||||
}
|
||||
|
||||
DBUG_RETURN(node->leaf > 0);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
@@ -42,7 +42,7 @@ libmysys_a_SOURCES = my_init.c my_getwd.c mf_getdate.c my_mmap.c \
|
||||
mf_wcomp.c mf_wfile.c my_gethwaddr.c \
|
||||
mf_qsort.c mf_qsort2.c mf_sort.c \
|
||||
ptr_cmp.c mf_radix.c queues.c \
|
||||
tree.c list.c hash.c array.c string.c typelib.c \
|
||||
tree.c trie.c list.c hash.c array.c string.c typelib.c \
|
||||
my_copy.c my_append.c my_lib.c \
|
||||
my_delete.c my_rename.c my_redel.c my_tempnam.c \
|
||||
my_chsize.c my_lread.c my_lwrite.c my_clock.c \
|
||||
|
||||
237
mysys/trie.c
Normal file
237
mysys/trie.c
Normal file
@@ -0,0 +1,237 @@
|
||||
/* Copyright (C) 2005 MySQL AB
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
|
||||
|
||||
/*
|
||||
Implementation of trie and Aho-Corasick automaton.
|
||||
Supports only charsets that can be compared byte-wise.
|
||||
|
||||
TODO:
|
||||
Add character frequencies. Can increase lookup speed
|
||||
up to 30%.
|
||||
Implement character-wise comparision.
|
||||
*/
|
||||
|
||||
|
||||
#include "mysys_priv.h"
|
||||
#include <m_string.h>
|
||||
#include <my_trie.h>
|
||||
#include <my_base.h>
|
||||
|
||||
|
||||
/*
|
||||
SYNOPSIS
|
||||
TRIE *trie_init (TRIE *trie, CHARSET_INFO *charset);
|
||||
|
||||
DESCRIPTION
|
||||
Allocates or initializes a `TRIE' object. If `trie' is a `NULL'
|
||||
pointer, the function allocates, initializes, and returns a new
|
||||
object. Otherwise, the object is initialized and the address of
|
||||
the object is returned. If `trie_init()' allocates a new object,
|
||||
it will be freed when `trie_free()' is called.
|
||||
|
||||
RETURN VALUE
|
||||
An initialized `TRIE*' object. `NULL' if there was insufficient
|
||||
memory to allocate a new object.
|
||||
*/
|
||||
|
||||
TRIE *trie_init (TRIE *trie, CHARSET_INFO *charset)
|
||||
{
|
||||
MEM_ROOT mem_root;
|
||||
DBUG_ENTER("trie_init");
|
||||
DBUG_ASSERT(charset);
|
||||
init_alloc_root(&mem_root,
|
||||
(sizeof(TRIE_NODE) * 128) + ALLOC_ROOT_MIN_BLOCK_SIZE,
|
||||
sizeof(TRIE_NODE) * 128);
|
||||
if (! trie)
|
||||
{
|
||||
if (! (trie= (TRIE *)alloc_root(&mem_root, sizeof(TRIE))))
|
||||
{
|
||||
free_root(&mem_root, MYF(0));
|
||||
DBUG_RETURN(NULL);
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(&trie->mem_root, &mem_root, sizeof(MEM_ROOT));
|
||||
trie->root.leaf= 0;
|
||||
trie->root.c= 0;
|
||||
trie->root.next= NULL;
|
||||
trie->root.links= NULL;
|
||||
trie->root.fail= NULL;
|
||||
trie->charset= charset;
|
||||
trie->nnodes= 0;
|
||||
trie->nwords= 0;
|
||||
DBUG_RETURN(trie);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
SYNOPSIS
|
||||
void trie_free (TRIE *trie);
|
||||
trie - valid pointer to `TRIE'
|
||||
|
||||
DESCRIPTION
|
||||
Frees the memory allocated for a `trie'.
|
||||
|
||||
RETURN VALUE
|
||||
None.
|
||||
*/
|
||||
|
||||
void trie_free (TRIE *trie)
|
||||
{
|
||||
MEM_ROOT mem_root;
|
||||
DBUG_ENTER("trie_free");
|
||||
DBUG_ASSERT(trie);
|
||||
memcpy(&mem_root, &trie->mem_root, sizeof(MEM_ROOT));
|
||||
free_root(&mem_root, MYF(0));
|
||||
DBUG_VOID_RETURN;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
SYNOPSIS
|
||||
my_bool trie_insert (TRIE *trie, const byte *key, uint keylen);
|
||||
trie - valid pointer to `TRIE'
|
||||
key - valid pointer to key to insert
|
||||
keylen - non-0 key length
|
||||
|
||||
DESCRIPTION
|
||||
Inserts new key into trie.
|
||||
|
||||
RETURN VALUE
|
||||
Upon successful completion, `trie_insert' returns `FALSE'. Otherwise
|
||||
`TRUE' is returned.
|
||||
|
||||
NOTES
|
||||
If this function fails you must assume `trie' is broken.
|
||||
However it can be freed with trie_free().
|
||||
*/
|
||||
|
||||
my_bool trie_insert (TRIE *trie, const byte *key, uint keylen)
|
||||
{
|
||||
TRIE_NODE *node;
|
||||
TRIE_NODE *next;
|
||||
byte p;
|
||||
uint k;
|
||||
DBUG_ENTER("trie_insert");
|
||||
DBUG_ASSERT(trie && key && keylen);
|
||||
node= &trie->root;
|
||||
trie->root.fail= NULL;
|
||||
for (k= 0; k < keylen; k++)
|
||||
{
|
||||
p= key[k];
|
||||
for (next= node->links; next; next= next->next)
|
||||
if (next->c == p)
|
||||
break;
|
||||
|
||||
if (! next)
|
||||
{
|
||||
TRIE_NODE *tmp= (TRIE_NODE *)alloc_root(&trie->mem_root,
|
||||
sizeof(TRIE_NODE));
|
||||
if (! tmp)
|
||||
DBUG_RETURN(TRUE);
|
||||
tmp->leaf= 0;
|
||||
tmp->c= p;
|
||||
tmp->links= tmp->fail= tmp->next= NULL;
|
||||
trie->nnodes++;
|
||||
if (! node->links)
|
||||
{
|
||||
node->links= tmp;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (next= node->links; next->next; next= next->next) /* no-op */;
|
||||
next->next= tmp;
|
||||
}
|
||||
node= tmp;
|
||||
}
|
||||
else
|
||||
{
|
||||
node= next;
|
||||
}
|
||||
}
|
||||
node->leaf= keylen;
|
||||
trie->nwords++;
|
||||
DBUG_RETURN(FALSE);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
SYNOPSIS
|
||||
my_bool trie_prepare (TRIE *trie);
|
||||
trie - valid pointer to `TRIE'
|
||||
|
||||
DESCRIPTION
|
||||
Constructs Aho-Corasick automaton.
|
||||
|
||||
RETURN VALUE
|
||||
Upon successful completion, `trie_prepare' returns `FALSE'. Otherwise
|
||||
`TRUE' is returned.
|
||||
*/
|
||||
|
||||
my_bool ac_trie_prepare (TRIE *trie)
|
||||
{
|
||||
TRIE_NODE **tmp_nodes;
|
||||
TRIE_NODE *node;
|
||||
uint32 fnode= 0;
|
||||
uint32 lnode= 0;
|
||||
DBUG_ENTER("trie_prepare");
|
||||
DBUG_ASSERT(trie);
|
||||
|
||||
tmp_nodes= (TRIE_NODE **)my_malloc(trie->nnodes * sizeof(TRIE_NODE *), MYF(0));
|
||||
if (! tmp_nodes)
|
||||
DBUG_RETURN(TRUE);
|
||||
|
||||
trie->root.fail= &trie->root;
|
||||
for (node= trie->root.links; node; node= node->next)
|
||||
{
|
||||
node->fail= &trie->root;
|
||||
tmp_nodes[lnode++]= node;
|
||||
}
|
||||
|
||||
while (fnode < lnode)
|
||||
{
|
||||
TRIE_NODE *current= (TRIE_NODE *)tmp_nodes[fnode++];
|
||||
for (node= current->links; node; node= node->next)
|
||||
{
|
||||
TRIE_NODE *fail= current->fail;
|
||||
tmp_nodes[lnode++]= node;
|
||||
while (! (node->fail= trie_goto(&trie->root, fail, node->c)))
|
||||
fail= fail->fail;
|
||||
}
|
||||
}
|
||||
my_free((gptr)tmp_nodes, MYF(0));
|
||||
DBUG_RETURN(FALSE);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
SYNOPSIS
|
||||
void ac_trie_init (TRIE *trie, AC_TRIE_STATE *state);
|
||||
trie - valid pointer to `TRIE'
|
||||
state - value pointer to `AC_TRIE_STATE'
|
||||
|
||||
DESCRIPTION
|
||||
Initializes `AC_TRIE_STATE' object.
|
||||
*/
|
||||
|
||||
void ac_trie_init (TRIE *trie, AC_TRIE_STATE *state)
|
||||
{
|
||||
DBUG_ENTER("ac_trie_init");
|
||||
DBUG_ASSERT(trie && state);
|
||||
state->trie= trie;
|
||||
state->node= &trie->root;
|
||||
DBUG_VOID_RETURN;
|
||||
}
|
||||
Reference in New Issue
Block a user