1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-27 18:02:13 +03:00

WL#2575 - Fulltext: Parser plugin for FTS

Manual merge.


Makefile.am:
  Added new 'plugin' subdir.
configure.in:
  Added plugin related makefiles.
include/my_base.h:
  Added HA_OPEN_FROM_SQL_LAYER flag - indicates that a table was opened from the sql layer.
  Added HA_OPTION_RELIES_ON_SQL_LAYER flag - indicates that a table relies on the sql layer.
  Added HA_CREATE_RELIES_ON_SQL_LAYER flag - indicates that a table must be created with
  HA_OPTION_RELIES_ON_SQL_LAYER flag.
include/myisam.h:
  Distinct fulltext parser number added.
include/plugin.h:
  Revise comment.
sql/ha_myisam.cc:
  Pass HA_OPEN_FROM_SQL_LAYER flag to mi_open().
  Pass HA_CREATE_RELIES_ON_SQL_LAYER flag to mi_create().
sql/sql_plugin.cc:
  Reuse "unused" dynamic array elements.
  A check for plugin info interface version.
sql/sql_plugin.h:
  Added plugin_type_names[] - string plugin type names.
sql/sql_show.cc:
  Use plugin_type_names array instead of switch to find literal parser name representation.
sql/sql_table.cc:
  Fixed that ALTER TABLE ... ADD INDEX loses WITH PARSER info.
storage/myisam/ft_boolean_search.c:
  Call fulltext parser init() function, pass MYSQL_FTPARSER_PARAM, returned by
  ftparser_call_initializer(), to parser->parse().
storage/myisam/ft_nlq_search.c:
  Call fulltext parser init() function, pass MYSQL_FTPARSER_PARAM, returned by
  ftparser_call_initializer(), to parser->parse().
storage/myisam/ft_parser.c:
  Added two functions:
  ftparser_call_initializer() - calls parser->init() function if specified and parser is not yet
  initialized. Returns MYSQL_FTPARSER_PARAM *.
  ftparser_call_deinitializer() - calls parser->deinit() function if specified and parser was
  initialized. Deinitializes all parsers.
  ft_parse() accepts additional param now - MYSQL_FTPARSER_PARAM and passes it to parser->parse().
storage/myisam/ft_update.c:
  Call fulltext parser init() function, pass MYSQL_FTPARSER_PARAM, returned by
  ftparser_call_initializer(), to _mi_ft_parse().
  _mi_ft_parse() accepts additional param now - MYSQL_FTPARSER_PARAM and passes
  it to parser->parse().
storage/myisam/ftdefs.h:
  Prototypes for new functions were added. MYSQL_FTPARSER_PARAM was added
  to ft_parse and _mi_ft_parse().
storage/myisam/mi_close.c:
  Free ftparser_param allocated by ftparser_call_initializer().
storage/myisam/mi_create.c:
  If a table relies on the sql layer, set HA_OPTION_RELIES_ON_SQL_LAYER.
storage/myisam/mi_locking.c:
  Call deinitializer for each initialized parser.
storage/myisam/mi_open.c:
  Set default values for share->ftparser and keydef->ftparser_nr.
  If a table is opened from the non-sql layer and HA_OPTION_RELIES_ON_SQL_LAYER is set, raise
  HA_ERR_UNSUPPORTED error.
storage/myisam/myisamdef.h:
  Added number of distinct parsers to MYISAM_SHARE.
  Added ftparser_param to MI_INFO.
plugin/Makefile.am:
  New BitKeeper file ``plugin/Makefile.am''
plugin/fulltext/Makefile.am:
  New BitKeeper file ``plugin/fulltext/Makefile.am''
plugin/fulltext/plugin_example.c:
  New BitKeeper file ``plugin/fulltext/plugin_example.c''
This commit is contained in:
unknown
2005-12-28 16:05:30 +04:00
parent 5bfbfb24e5
commit 38005eae6a
23 changed files with 602 additions and 140 deletions

View File

@ -66,36 +66,50 @@ struct st_mysql_plugin
/* Parsing modes. Set in MYSQL_FTPARSER_PARAM::mode */
/*
The fast and simple mode. Parser is expected to return only those words that
go into the index. Stopwords or too short/long words should not be returned.
'boolean_info' argument of mysql_add_word() does not have to be set.
Fast and simple mode. This mode is used for indexing, and natural
language queries.
This mode is used for indexing, and natural language queries.
The parser is expected to return only those words that go into the
index. Stopwords or too short/long words should not be returned. The
'boolean_info' argument of mysql_add_word() does not have to be set.
*/
#define MYSQL_FTPARSER_SIMPLE_MODE 0
/*
The parser is not allowed to ignore words in this mode. Every word should
be returned, including stopwords and words that are too short or long.
'boolean_info' argument of mysql_add_word() does not have to be set.
Parse with stopwords mode. This mode is used in boolean searches for
"phrase matching."
This mode is used in boolean searches for "phrase matching."
The parser is not allowed to ignore words in this mode. Every word
should be returned, including stopwords and words that are too short
or long. The 'boolean_info' argument of mysql_add_word() does not
have to be set.
*/
#define MYSQL_FTPARSER_WITH_STOPWORDS 1
/*
Parse in boolean mode. The parser should provide a valid
MYSQL_FTPARSER_BOOLEAN_INFO structure in the 'boolean_info' argument
to mysql_add_word(). Usually that means that the parser should
recognize boolean operators in the parsing stream and set appropriate
fields in MYSQL_FTPARSER_BOOLEAN_INFO structure accordingly. As
for MYSQL_FTPARSER_WITH_STOPWORDS mode, no word should be ignored.
Instead, use FT_TOKEN_STOPWORD for the token type of such a word.
Parse in boolean mode. This mode is used to parse a boolean query string.
This mode is used to parse a boolean query string.
The parser should provide a valid MYSQL_FTPARSER_BOOLEAN_INFO
structure in the 'boolean_info' argument to mysql_add_word().
Usually that means that the parser should recognize boolean operators
in the parsing stream and set appropriate fields in
MYSQL_FTPARSER_BOOLEAN_INFO structure accordingly. As for
MYSQL_FTPARSER_WITH_STOPWORDS mode, no word should be ignored.
Instead, use FT_TOKEN_STOPWORD for the token type of such a word.
*/
#define MYSQL_FTPARSER_FULL_BOOLEAN_INFO 2
/*
Token types for boolean mode searching (used for the type member of
MYSQL_FTPARSER_BOOLEAN_INFO struct)
FT_TOKEN_EOF: End of data.
FT_TOKEN_WORD: Regular word.
FT_TOKEN_LEFT_PAREN: Left parenthesis (start of group/sub-expression).
FT_TOKEN_RIGHT_PAREN: Right parenthesis (end of group/sub-expression).
FT_TOKEN_STOPWORD: Stopword.
*/
enum enum_ft_token_type
{
FT_TOKEN_EOF= 0,
@ -110,8 +124,27 @@ enum enum_ft_token_type
boolean-mode metadata to the MySQL search engine for every word in
the search query. A valid instance of this structure must be filled
in by the plugin parser and passed as an argument in the call to
mysql_add_word (the function from structure MYSQL_FTPARSER_PARAM)
when a query is parsed in boolean mode.
mysql_add_word (the callback function in the MYSQL_FTPARSER_PARAM
structure) when a query is parsed in boolean mode.
type: The token type. Should be one of the enum_ft_token_type values.
yesno: Whether the word must be present for a match to occur:
>0 Must be present
<0 Must not be present
0 Neither; the word is optional but its presence increases the relevance
With the default settings of the ft_boolean_syntax system variable,
>0 corresponds to the '+' operator, <0 corresponds to the '-' operator,
and 0 means neither operator was used.
weight_adjust: A weighting factor that determines how much a match
for the word counts. Can be used to increase or decrease the word's
importance.
wasign: The sign of the weight_adjust value.
trunc: Corresponds to the '*' operator in the default setting of the
ft_boolean_syntax system variable.
*/
typedef struct st_mysql_ftparser_boolean_info
@ -129,48 +162,63 @@ typedef struct st_mysql_ftparser_boolean_info
/*
An argument of the full-text parser plugin. This structure is
filled by MySQL server and passed to the parsing function of the
filled in by MySQL server and passed to the parsing function of the
plugin as an in/out parameter.
mysql_parse: A pointer to the built-in parser implementation of the
server. It's set by the server and can be used by the parser plugin
to invoke the MySQL default parser. If plugin's role is to extract
textual data from .doc, .pdf or .xml content, it might extract
plaintext from the content, and then pass the text to the default
MySQL parser to be parsed. When mysql_parse is called, its param
argument should be given as the mysql_ftparam value.
mysql_add_word: A server callback to add a new word. When parsing
a document, the server sets this to point at a function that adds
the word to MySQL full-text index. When parsing a search query,
this function will add the new word to the list of words to search
for. When mysql_add_word is called, its param argument should be
given as the mysql_ftparam value. boolean_info can be NULL for all
cases except when mode is MYSQL_FTPARSER_FULL_BOOLEAN_INFO.
ftparser_state: A generic pointer. The plugin can set it to point
to information to be used internally for its own purposes.
mysql_ftparam: This is set by the server. It is passed as the first
argument to the mysql_parse or mysql_add_word callback. The plugin
should not modify it.
cs: Information about the character set of the document or query string.
doc: A pointer to the document or query string to be parsed.
length: Length of the document or query string, in bytes.
mode: The parsing mode. With boolean operators, with stopwords, or
nothing. See MYSQL_FTPARSER_* constants above.
*/
typedef struct st_mysql_ftparser_param
{
/*
A fallback pointer to the built-in parser implementation
of the server. It's set by the server and can be used
by the parser plugin to invoke the MySQL default parser.
If plugin's role is to extract textual data from .doc,
.pdf or .xml content, it might use the default MySQL parser
to parse the extracted plaintext string.
When calling it, pass the mysql_ftparam member of this
structure as the 'param' argument.
*/
int (*mysql_parse)(void *param, byte *doc, uint doc_len);
/*
A server callback to add a new word.
When parsing a document, the server sets this to point at
a function that adds the word to MySQL full-text index.
When parsing a search query, this function will
add the new word to the list of words to search for.
When calling it, pass the mysql_ftparam member of this
structure as the 'param' argument.
boolean_info can be NULL for all cases except
MYSQL_FTPARSER_FULL_BOOLEAN_INFO mode.
*/
int (*mysql_add_word)(void *param, byte *word, uint word_len,
MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info);
/*
A pointer to the parser local state. This is an inout parameter:
the plugin can set it to point to information used internally
for its own purposes.
*/
void *ftparser_state;
/*
Set by the server. It is passed as the first argument to the
mysql_parse or mysql_add_word callback. The plugin should not
modify it.
*/
void *mysql_ftparam;
/* Character set of the document or the query */
CHARSET_INFO *cs;
/* A pointer to the document or the query to be parsed */
byte *doc;
/* Document/query length, in bytes */
uint length;
/*
Parsing mode: with boolean operators, with stopwords, or nothing.
See MYSQL_FTPARSER_* constants above.
*/
int mode;
} MYSQL_FTPARSER_PARAM;
/*
Full-text parser descriptor.
interface_version is, e.g., MYSQL_FTPARSER_INTERFACE_VERSION.
The parsing, initialization, and deinitialization functions are
invoked per SQL statement for which the parser is used.
*/
struct st_mysql_ftparser
{
int interface_version;