mirror of
https://github.com/MariaDB/server.git
synced 2025-07-27 18:02:13 +03:00
WL#2575 - Fulltext: Parser plugin for FTS
Manual merge. Makefile.am: Added new 'plugin' subdir. configure.in: Added plugin related makefiles. include/my_base.h: Added HA_OPEN_FROM_SQL_LAYER flag - indicates that a table was opened from the sql layer. Added HA_OPTION_RELIES_ON_SQL_LAYER flag - indicates that a table relies on the sql layer. Added HA_CREATE_RELIES_ON_SQL_LAYER flag - indicates that a table must be created with HA_OPTION_RELIES_ON_SQL_LAYER flag. include/myisam.h: Distinct fulltext parser number added. include/plugin.h: Revise comment. sql/ha_myisam.cc: Pass HA_OPEN_FROM_SQL_LAYER flag to mi_open(). Pass HA_CREATE_RELIES_ON_SQL_LAYER flag to mi_create(). sql/sql_plugin.cc: Reuse "unused" dynamic array elements. A check for plugin info interface version. sql/sql_plugin.h: Added plugin_type_names[] - string plugin type names. sql/sql_show.cc: Use plugin_type_names array instead of switch to find literal parser name representation. sql/sql_table.cc: Fixed that ALTER TABLE ... ADD INDEX loses WITH PARSER info. storage/myisam/ft_boolean_search.c: Call fulltext parser init() function, pass MYSQL_FTPARSER_PARAM, returned by ftparser_call_initializer(), to parser->parse(). storage/myisam/ft_nlq_search.c: Call fulltext parser init() function, pass MYSQL_FTPARSER_PARAM, returned by ftparser_call_initializer(), to parser->parse(). storage/myisam/ft_parser.c: Added two functions: ftparser_call_initializer() - calls parser->init() function if specified and parser is not yet initialized. Returns MYSQL_FTPARSER_PARAM *. ftparser_call_deinitializer() - calls parser->deinit() function if specified and parser was initialized. Deinitializes all parsers. ft_parse() accepts additional param now - MYSQL_FTPARSER_PARAM and passes it to parser->parse(). storage/myisam/ft_update.c: Call fulltext parser init() function, pass MYSQL_FTPARSER_PARAM, returned by ftparser_call_initializer(), to _mi_ft_parse(). _mi_ft_parse() accepts additional param now - MYSQL_FTPARSER_PARAM and passes it to parser->parse(). 
storage/myisam/ftdefs.h: Prototypes for new functions were added. MYSQL_FTPARSER_PARAM was added to ft_parse and _mi_ft_parse(). storage/myisam/mi_close.c: Free ftparser_param allocated by ftparser_call_initializer(). storage/myisam/mi_create.c: If a table relies on the sql layer, set HA_OPTION_RELIES_ON_SQL_LAYER. storage/myisam/mi_locking.c: Call deinitializer for each initialized parser. storage/myisam/mi_open.c: Set default values for share->ftparser and keydef->ftparser_nr. If a table is opened from the non-sql layer and HA_OPTION_RELIES_ON_SQL_LAYER is set, raise HA_ERR_UNSUPPORTED error. storage/myisam/myisamdef.h: Added number of distinct parsers to MYISAM_SHARE. Added ftparser_param to MI_INFO. plugin/Makefile.am: New BitKeeper file ``plugin/Makefile.am'' plugin/fulltext/Makefile.am: New BitKeeper file ``plugin/fulltext/Makefile.am'' plugin/fulltext/plugin_example.c: New BitKeeper file ``plugin/fulltext/plugin_example.c''
This commit is contained in:
136
include/plugin.h
136
include/plugin.h
@ -66,36 +66,50 @@ struct st_mysql_plugin
|
||||
|
||||
/* Parsing modes. Set in MYSQL_FTPARSER_PARAM::mode */
|
||||
/*
|
||||
The fast and simple mode. Parser is expected to return only those words that
|
||||
go into the index. Stopwords or too short/long words should not be returned.
|
||||
'boolean_info' argument of mysql_add_word() does not have to be set.
|
||||
Fast and simple mode. This mode is used for indexing, and natural
|
||||
language queries.
|
||||
|
||||
This mode is used for indexing, and natural language queries.
|
||||
The parser is expected to return only those words that go into the
|
||||
index. Stopwords or too short/long words should not be returned. The
|
||||
'boolean_info' argument of mysql_add_word() does not have to be set.
|
||||
*/
|
||||
#define MYSQL_FTPARSER_SIMPLE_MODE 0
|
||||
|
||||
/*
|
||||
The parser is not allowed to ignore words in this mode. Every word should
|
||||
be returned, including stopwords and words that are too short or long.
|
||||
'boolean_info' argument of mysql_add_word() does not have to be set.
|
||||
Parse with stopwords mode. This mode is used in boolean searches for
|
||||
"phrase matching."
|
||||
|
||||
This mode is used in boolean searches for "phrase matching."
|
||||
The parser is not allowed to ignore words in this mode. Every word
|
||||
should be returned, including stopwords and words that are too short
|
||||
or long. The 'boolean_info' argument of mysql_add_word() does not
|
||||
have to be set.
|
||||
*/
|
||||
#define MYSQL_FTPARSER_WITH_STOPWORDS 1
|
||||
|
||||
/*
|
||||
Parse in boolean mode. The parser should provide a valid
|
||||
MYSQL_FTPARSER_BOOLEAN_INFO structure in the 'boolean_info' argument
|
||||
to mysql_add_word(). Usually that means that the parser should
|
||||
recognize boolean operators in the parsing stream and set appropriate
|
||||
fields in MYSQL_FTPARSER_BOOLEAN_INFO structure accordingly. As
|
||||
for MYSQL_FTPARSER_WITH_STOPWORDS mode, no word should be ignored.
|
||||
Instead, use FT_TOKEN_STOPWORD for the token type of such a word.
|
||||
Parse in boolean mode. This mode is used to parse a boolean query string.
|
||||
|
||||
This mode is used to parse a boolean query string.
|
||||
The parser should provide a valid MYSQL_FTPARSER_BOOLEAN_INFO
|
||||
structure in the 'boolean_info' argument to mysql_add_word().
|
||||
Usually that means that the parser should recognize boolean operators
|
||||
in the parsing stream and set appropriate fields in
|
||||
MYSQL_FTPARSER_BOOLEAN_INFO structure accordingly. As for
|
||||
MYSQL_FTPARSER_WITH_STOPWORDS mode, no word should be ignored.
|
||||
Instead, use FT_TOKEN_STOPWORD for the token type of such a word.
|
||||
*/
|
||||
#define MYSQL_FTPARSER_FULL_BOOLEAN_INFO 2
|
||||
|
||||
/*
|
||||
Token types for boolean mode searching (used for the type member of
|
||||
MYSQL_FTPARSER_BOOLEAN_INFO struct)
|
||||
|
||||
FT_TOKEN_EOF: End of data.
|
||||
FT_TOKEN_WORD: Regular word.
|
||||
FT_TOKEN_LEFT_PAREN: Left parenthesis (start of group/sub-expression).
|
||||
FT_TOKEN_RIGHT_PAREN: Right parenthesis (end of group/sub-expression).
|
||||
FT_TOKEN_STOPWORD: Stopword.
|
||||
*/
|
||||
|
||||
enum enum_ft_token_type
|
||||
{
|
||||
FT_TOKEN_EOF= 0,
|
||||
@ -110,8 +124,27 @@ enum enum_ft_token_type
|
||||
boolean-mode metadata to the MySQL search engine for every word in
|
||||
the search query. A valid instance of this structure must be filled
|
||||
in by the plugin parser and passed as an argument in the call to
|
||||
mysql_add_word (the function from structure MYSQL_FTPARSER_PARAM)
|
||||
when a query is parsed in boolean mode.
|
||||
mysql_add_word (the callback function in the MYSQL_FTPARSER_PARAM
|
||||
structure) when a query is parsed in boolean mode.
|
||||
|
||||
type: The token type. Should be one of the enum_ft_token_type values.
|
||||
|
||||
yesno: Whether the word must be present for a match to occur:
|
||||
>0 Must be present
|
||||
<0 Must not be present
|
||||
0 Neither; the word is optional but its presence increases the relevance
|
||||
With the default settings of the ft_boolean_syntax system variable,
|
||||
>0 corresponds to the '+' operator, <0 corresponds to the '-' operator,
|
||||
and 0 means neither operator was used.
|
||||
|
||||
weight_adjust: A weighting factor that determines how much a match
|
||||
for the word counts. Can be used to increase or decrease the word's
|
||||
importance.
|
||||
|
||||
wasign: The sign of the weight_adjust value.
|
||||
|
||||
trunc: Corresponds to the '*' operator in the default setting of the
|
||||
ft_boolean_syntax system variable.
|
||||
*/
|
||||
|
||||
typedef struct st_mysql_ftparser_boolean_info
|
||||
@ -129,48 +162,63 @@ typedef struct st_mysql_ftparser_boolean_info
|
||||
|
||||
/*
|
||||
An argument of the full-text parser plugin. This structure is
|
||||
filled by MySQL server and passed to the parsing function of the
|
||||
filled in by MySQL server and passed to the parsing function of the
|
||||
plugin as an in/out parameter.
|
||||
|
||||
mysql_parse: A pointer to the built-in parser implementation of the
|
||||
server. It's set by the server and can be used by the parser plugin
|
||||
to invoke the MySQL default parser. If plugin's role is to extract
|
||||
textual data from .doc, .pdf or .xml content, it might extract
|
||||
plaintext from the content, and then pass the text to the default
|
||||
MySQL parser to be parsed. When mysql_parse is called, its param
|
||||
argument should be given as the mysql_ftparam value.
|
||||
|
||||
mysql_add_word: A server callback to add a new word. When parsing
|
||||
a document, the server sets this to point at a function that adds
|
||||
the word to MySQL full-text index. When parsing a search query,
|
||||
this function will add the new word to the list of words to search
|
||||
for. When mysql_add_word is called, its param argument should be
|
||||
given as the mysql_ftparam value. boolean_info can be NULL for all
|
||||
cases except when mode is MYSQL_FTPARSER_FULL_BOOLEAN_INFO.
|
||||
|
||||
ftparser_state: A generic pointer. The plugin can set it to point
|
||||
to information to be used internally for its own purposes.
|
||||
|
||||
mysql_ftparam: This is set by the server. It is passed as the first
|
||||
argument to the mysql_parse or mysql_add_word callback. The plugin
|
||||
should not modify it.
|
||||
|
||||
cs: Information about the character set of the document or query string.
|
||||
|
||||
doc: A pointer to the document or query string to be parsed.
|
||||
|
||||
length: Length of the document or query string, in bytes.
|
||||
|
||||
mode: The parsing mode. With boolean operators, with stopwords, or
|
||||
nothing. See MYSQL_FTPARSER_* constants above.
|
||||
*/
|
||||
|
||||
typedef struct st_mysql_ftparser_param
|
||||
{
|
||||
/*
|
||||
A fallback pointer to the built-in parser implementation
|
||||
of the server. It's set by the server and can be used
|
||||
by the parser plugin to invoke the MySQL default parser.
|
||||
If plugin's role is to extract textual data from .doc,
|
||||
.pdf or .xml content, it might use the default MySQL parser
|
||||
to parse the extracted plaintext string.
|
||||
*/
|
||||
int (*mysql_parse)(void *param, byte *doc, uint doc_len);
|
||||
/*
|
||||
A server callback to add a new word.
|
||||
When parsing a document, the server sets this to point at
|
||||
a function that adds the word to MySQL full-text index.
|
||||
When parsing a search query, this function will
|
||||
add the new word to the list of words to search for.
|
||||
boolean_info can be NULL for all cases except
|
||||
MYSQL_FTPARSER_FULL_BOOLEAN_INFO mode.
|
||||
*/
|
||||
int (*mysql_add_word)(void *param, byte *word, uint word_len,
|
||||
MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info);
|
||||
/* A pointer to the parser local state. This is an inout parameter. */
|
||||
void *ftparser_state;
|
||||
void *mysql_ftparam;
|
||||
/* Character set of the document or the query */
|
||||
CHARSET_INFO *cs;
|
||||
/* A pointer to the document or the query to be parsed */
|
||||
byte *doc;
|
||||
/* Document/query length */
|
||||
uint length;
|
||||
/*
|
||||
Parsing mode: with boolean operators, with stopwords, or nothing.
|
||||
See MYSQL_FTPARSER_* constants above.
|
||||
*/
|
||||
int mode;
|
||||
} MYSQL_FTPARSER_PARAM;
|
||||
|
||||
/*
|
||||
Full-text parser descriptor.
|
||||
|
||||
interface_version is, e.g., MYSQL_FTPARSER_INTERFACE_VERSION.
|
||||
The parsing, initialization, and deinitialization functions are
|
||||
invoked per SQL statement for which the parser is used.
|
||||
*/
|
||||
|
||||
struct st_mysql_ftparser
|
||||
{
|
||||
int interface_version;
|
||||
|
Reference in New Issue
Block a user