diff --git a/mysql-test/suite/innodb_fts/r/create.result b/mysql-test/suite/innodb_fts/r/create.result new file mode 100644 index 00000000000..c537aa81efd --- /dev/null +++ b/mysql-test/suite/innodb_fts/r/create.result @@ -0,0 +1,168 @@ +SET NAMES utf8mb4; +# +# MDEV-11233 CREATE FULLTEXT INDEX with a token +# longer than 127 bytes crashes server +# +CREATE TABLE t(t TEXT CHARACTER SET utf8mb3) ENGINE=InnoDB; +INSERT INTO t SET t=REPEAT(CONCAT(REPEAT(_utf8mb3 0xE0B987, 4), REPEAT(_utf8mb3 0xE0B989, 5)), 5); +INSERT INTO t SET t=REPEAT(_utf8 0xefbc90,84); +INSERT INTO t SET t=REPEAT('befor',17); +INSERT INTO t SET t='BeforeTheIndexCreation'; +CREATE FULLTEXT INDEX ft ON t(t); +Warnings: +Warning 124 InnoDB rebuilding table to add column FTS_DOC_ID +INSERT INTO t SET t='this was inserted after creating the index'; +INSERT INTO t SET t=REPEAT(_utf8 0xefbc91,84); +INSERT INTO t SET t=REPEAT('after',17); +INSERT INTO t SET t=REPEAT(_utf8mb3 0xe794b2e9aaa8e69687, 15); +# The data below is not 3-byte UTF-8, but 4-byte chars. +INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9695, 84); +Warnings: +Warning 1366 Incorrect string value: '\xF0\x9F\x96\x95\xF0\x9F...' for column 't' at row 1 +INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9696, 85); +Warnings: +Warning 1366 Incorrect string value: '\xF0\x9F\x96\x96\xF0\x9F...' for column 't' at row 1 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST +(REPEAT(CONCAT(REPEAT(_utf8mb3 0xE0B987, 4), REPEAT(_utf8mb3 0xE0B989, 5)), 5)); +COUNT(*) +1 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('BeforeTheIndexCreation'); +COUNT(*) +1 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('befor',17)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('after'); +COUNT(*) +1 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('after',17)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 83)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 84)); +COUNT(*) +1 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 85)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 83)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 84)); +COUNT(*) +1 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 85)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 83)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 84)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 84)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 85)); +COUNT(*) +0 +SELECT * FROM t; +t +็็็็้้้้้็็็็้้้้้็็็็้้้้้็็็็้้้้้็็็็้้้้้ +000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +beforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbefor +BeforeTheIndexCreation +this was inserted after creating the index +111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +afterafterafterafterafterafterafterafterafterafterafterafterafterafterafterafterafter +甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文 +???????????????????????????????????????????????????????????????????????????????????? +????????????????????????????????????????????????????????????????????????????????????? 
+SELECT len,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS where name='word' GROUP BY len; +len COUNT(*) +252 6 +DROP TABLE t; +CREATE TABLE t(t TEXT CHARACTER SET utf8mb4) ENGINE=InnoDB; +INSERT INTO t SET t=REPEAT(_utf8mb3 0xe794b2e9aaa8e69687, 15); +INSERT INTO t SET t=REPEAT(_utf8 0xefbc90,84); +INSERT INTO t SET t=REPEAT('befor',17); +INSERT INTO t SET t='BeforeTheIndexCreation'; +CREATE FULLTEXT INDEX ft ON t(t); +Warnings: +Warning 124 InnoDB rebuilding table to add column FTS_DOC_ID +INSERT INTO t SET t='this was inserted after creating the index'; +INSERT INTO t SET t=REPEAT(_utf8 0xefbc91,84); +INSERT INTO t SET t=REPEAT('after',17); +INSERT INTO t SET t=REPEAT(concat(repeat(_utf8mb3 0xE0B987, 4), repeat(_utf8mb3 0xE0B989, 5)), 5); +INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9695, 84); +# The token below exceeds the 84-character limit. +INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9696, 85); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb3 0xe794b2e9aaa8e69687, 15)); +COUNT(*) +1 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('BeforeTheIndexCreation'); +COUNT(*) +1 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('befor',17)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('after'); +COUNT(*) +1 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('after',17)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 83)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 84)); +COUNT(*) +1 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 85)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 83)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 84)); +COUNT(*) +1 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 85)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 83)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 84)); +COUNT(*) +1 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 84)); +COUNT(*) +0 +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 85)); +COUNT(*) +0 +SELECT * FROM t; +t +甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文 +000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +beforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbefor +BeforeTheIndexCreation +this was inserted after creating the index +111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +afterafterafterafterafterafterafterafterafterafterafterafterafterafterafterafterafter +็็็็้้้้้็็็็้้้้้็็็็้้้้้็็็็้้้้้็็็็้้้้้ +🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕 +🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖 +SELECT len,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS where name='word' GROUP BY len; +len COUNT(*) +336 6 +DROP TABLE t; +CREATE TABLE t(t TEXT CHARACTER SET latin1, FULLTEXT INDEX(t)) +ENGINE=InnoDB; +SELECT len,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS where name='word' GROUP BY len; +len COUNT(*) +84 6 +DROP TABLE t; diff --git a/mysql-test/suite/innodb_fts/t/create.opt b/mysql-test/suite/innodb_fts/t/create.opt new file mode 100644 index 00000000000..3ad568c816e --- /dev/null +++ b/mysql-test/suite/innodb_fts/t/create.opt @@ -0,0 +1 @@ +--loose-innodb-sys-columns diff 
--git a/mysql-test/suite/innodb_fts/t/create.test b/mysql-test/suite/innodb_fts/t/create.test new file mode 100644 index 00000000000..f0329602ed1 --- /dev/null +++ b/mysql-test/suite/innodb_fts/t/create.test @@ -0,0 +1,92 @@ +--source include/have_innodb.inc +SET NAMES utf8mb4; + +--echo # +--echo # MDEV-11233 CREATE FULLTEXT INDEX with a token +--echo # longer than 127 bytes crashes server +--echo # + +# This bug is the result of merging the Oracle MySQL follow-up fix +# BUG#22963169 MYSQL CRASHES ON CREATE FULLTEXT INDEX +# without merging a fix of Bug#79475 Insert a token of 84 4-bytes +# chars into fts index causes server crash. + +# Oracle did not publish tests for either of the above MySQL bugs. +# The tests below were developed for MariaDB Server. +# The maximum length of a fulltext-indexed word is 84 characters. + +CREATE TABLE t(t TEXT CHARACTER SET utf8mb3) ENGINE=InnoDB; +INSERT INTO t SET t=REPEAT(CONCAT(REPEAT(_utf8mb3 0xE0B987, 4), REPEAT(_utf8mb3 0xE0B989, 5)), 5); +INSERT INTO t SET t=REPEAT(_utf8 0xefbc90,84); +INSERT INTO t SET t=REPEAT('befor',17); # too long, will not be indexed +INSERT INTO t SET t='BeforeTheIndexCreation'; +CREATE FULLTEXT INDEX ft ON t(t); +INSERT INTO t SET t='this was inserted after creating the index'; +INSERT INTO t SET t=REPEAT(_utf8 0xefbc91,84); +INSERT INTO t SET t=REPEAT('after',17); # too long, will not be indexed +INSERT INTO t SET t=REPEAT(_utf8mb3 0xe794b2e9aaa8e69687, 15); +--echo # The data below is not 3-byte UTF-8, but 4-byte chars. +INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9695, 84); +INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9696, 85); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST +(REPEAT(CONCAT(REPEAT(_utf8mb3 0xE0B987, 4), REPEAT(_utf8mb3 0xE0B989, 5)), 5)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('BeforeTheIndexCreation'); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('befor',17)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('after'); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('after',17)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 83)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 84)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 85)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 83)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 84)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 85)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 83)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 84)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 84)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 85)); +SELECT * FROM t; + +# The column length should be 252 bytes (84 characters * 3 bytes/character). 
+SELECT len,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS where name='word' GROUP BY len; +DROP TABLE t; + +CREATE TABLE t(t TEXT CHARACTER SET utf8mb4) ENGINE=InnoDB; +INSERT INTO t SET t=REPEAT(_utf8mb3 0xe794b2e9aaa8e69687, 15); +INSERT INTO t SET t=REPEAT(_utf8 0xefbc90,84); +INSERT INTO t SET t=REPEAT('befor',17); # too long, will not be indexed +INSERT INTO t SET t='BeforeTheIndexCreation'; +CREATE FULLTEXT INDEX ft ON t(t); +INSERT INTO t SET t='this was inserted after creating the index'; +INSERT INTO t SET t=REPEAT(_utf8 0xefbc91,84); +INSERT INTO t SET t=REPEAT('after',17); # too long, will not be indexed +INSERT INTO t SET t=REPEAT(concat(repeat(_utf8mb3 0xE0B987, 4), repeat(_utf8mb3 0xE0B989, 5)), 5); +INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9695, 84); +--echo # The token below exceeds the 84-character limit. +INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9696, 85); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb3 0xe794b2e9aaa8e69687, 15)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('BeforeTheIndexCreation'); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('befor',17)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('after'); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('after',17)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 83)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 84)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 85)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 83)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 84)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 85)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 83)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 84)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 84)); +SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 85)); +SELECT * FROM t; + +# The column length should be 336 bytes (84 characters * 4 bytes/character). +SELECT len,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS where name='word' GROUP BY len; +DROP TABLE t; + +CREATE TABLE t(t TEXT CHARACTER SET latin1, FULLTEXT INDEX(t)) +ENGINE=InnoDB; + +# The column length should be 84 bytes (84 characters * 1 byte/character). +SELECT len,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS where name='word' GROUP BY len; +DROP TABLE t; diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc index 3451607a37e..7d538dbd13a 100644 --- a/storage/innobase/fts/fts0fts.cc +++ b/storage/innobase/fts/fts0fts.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, MariaDB Corporation. All Rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1992,18 +1993,20 @@ func_exit: return(error); } -/** Creates one FTS auxiliary index table for an FTS index. + +/** Create one FTS auxiliary index table for an FTS index. 
@param[in,out] trx transaction @param[in] index the index instance @param[in] fts_table fts_table structure -@param[in] heap memory heap +@param[in,out] heap memory heap +@see row_merge_create_fts_sort_index() @return DB_SUCCESS or error code */ static dict_table_t* fts_create_one_index_table( trx_t* trx, const dict_index_t* index, - fts_table_t* fts_table, + const fts_table_t* fts_table, mem_heap_t* heap) { dict_field_t* field; @@ -2027,7 +2030,8 @@ fts_create_one_index_table( charset == &my_charset_latin1 ? DATA_VARCHAR : DATA_VARMYSQL, field->col->prtype, - FTS_INDEX_WORD_LEN); + FTS_MAX_WORD_LEN_IN_CHAR + * DATA_MBMAXLEN(field->col->mbminmaxlen)); dict_mem_table_add_col(new_table, heap, "first_doc_id", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc index 5989aff83f4..45c71f6f1cd 100644 --- a/storage/innobase/fts/fts0opt.cc +++ b/storage/innobase/fts/fts0opt.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, MariaDB Corporation. All Rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -285,7 +286,7 @@ fts_zip_initialize( zip->last_big_block = 0; zip->word.f_len = 0; - memset(zip->word.f_str, 0, FTS_MAX_WORD_LEN); + *zip->word.f_str = 0; ib_vector_reset(zip->blocks); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 87941833182..eb817b295af 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -7786,8 +7786,7 @@ get_innobase_type_from_mysql_type( case MYSQL_TYPE_VARCHAR: /* new >= 5.0.3 true VARCHAR */ if (field->binary()) { return(DATA_BINARY); - } else if (strcmp(field->charset()->name, - "latin1_swedish_ci") == 0) { + } else if (field->charset() == &my_charset_latin1) { return(DATA_VARCHAR); } else { return(DATA_VARMYSQL); @@ -7795,10 +7794,9 @@ case MYSQL_TYPE_BIT: case MYSQL_TYPE_STRING: if (field->binary()) { return(DATA_FIXBINARY); - } else if (strcmp(field->charset()->name, - "latin1_swedish_ci") == 0) { + } else if (field->charset() == &my_charset_latin1) { return(DATA_CHAR); } else { return(DATA_MYSQL); diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h index 2d256472ef6..c6b39bce286 100644 --- a/storage/innobase/include/fts0fts.h +++ b/storage/innobase/include/fts0fts.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, MariaDB Corporation. All Rights reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -103,8 +104,8 @@ those defined in mysql file ft_global.h */ should not exceed FTS_DOC_ID_MAX_STEP */ #define FTS_DOC_ID_MAX_STEP 65535 -/** Maximum possible Fulltext word length */ -#define FTS_MAX_WORD_LEN HA_FT_MAXBYTELEN +/** Maximum possible Fulltext word length in bytes (assuming mbmaxlen=4) */ +#define FTS_MAX_WORD_LEN (HA_FT_MAXCHARLEN * 4) /** Maximum possible Fulltext word length (in characters) */ #define FTS_MAX_WORD_LEN_IN_CHAR HA_FT_MAXCHARLEN @@ -120,7 +121,6 @@ should not exceed FTS_DOC_ID_MAX_STEP */ #define FTS_CONFIG_TABLE_KEY_COL_LEN 50 #define FTS_CONFIG_TABLE_VALUE_COL_LEN 200 -#define FTS_INDEX_WORD_LEN FTS_MAX_WORD_LEN #define FTS_INDEX_FIRST_DOC_ID_LEN 8 #define FTS_INDEX_LAST_DOC_ID_LEN 8 #define FTS_INDEX_DOC_COUNT_LEN 4 diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc index 7729d4c5c30..3d459e00506 100644 --- a/storage/innobase/row/row0ftsort.cc +++ b/storage/innobase/row/row0ftsort.cc @@ -63,6 +63,8 @@ tokenized doc string. The index has three "fields": integer value) 3) Word's position in original doc. +@see fts_create_one_index_table() + @return dict_index_t structure for the fts sort index */ dict_index_t* row_merge_create_fts_sort_index( @@ -103,16 +105,12 @@ row_merge_create_fts_sort_index( field->prefix_len = 0; field->col = static_cast( mem_heap_alloc(new_index->heap, sizeof(dict_col_t))); - field->col->len = FTS_MAX_WORD_LEN; - - if (strcmp(charset->name, "latin1_swedish_ci") == 0) { - field->col->mtype = DATA_VARCHAR; - } else { - field->col->mtype = DATA_VARMYSQL; - } - field->col->prtype = idx_field->col->prtype | DATA_NOT_NULL; + field->col->mtype = charset == &my_charset_latin1 + ? DATA_VARCHAR : DATA_VARMYSQL; field->col->mbminmaxlen = idx_field->col->mbminmaxlen; + field->col->len = HA_FT_MAXCHARLEN * DATA_MBMAXLEN(field->col->mbminmaxlen); + field->fixed_len = 0; /* Doc ID */ @@ -487,7 +485,8 @@ row_merge_fts_doc_tokenize_by_parser( /*********************************************************************//** Tokenize incoming text data and add to the sort buffer. -@return TRUE if the record passed, FALSE if out of space */ +@see row_merge_buf_encode() +@return TRUE if the record passed, FALSE if out of space */ static ibool row_merge_fts_doc_tokenize( @@ -495,8 +494,6 @@ row_merge_fts_doc_tokenize( row_merge_buf_t** sort_buf, /*!< in/out: sort buffer */ doc_id_t doc_id, /*!< in: Doc ID */ fts_doc_t* doc, /*!< in: Doc to be tokenized */ - dtype_t* word_dtype, /*!< in: data structure for - word col */ merge_file_t** merge_file, /*!< in/out: merge file */ ibool opt_doc_id_size,/*!< in: whether to use 4 bytes instead of 8 bytes integer to @@ -530,7 +527,8 @@ row_merge_fts_doc_tokenize( while (t_ctx->processed_len < doc->text.f_len) { ulint idx = 0; ib_uint32_t position; - ulint cur_len = 0; + ulint offset = 0; + ulint cur_len; doc_id_t write_doc_id; row_fts_token_t* fts_token = NULL; @@ -620,14 +618,34 @@ row_merge_fts_doc_tokenize( dfield_set_data(field, t_str.f_str, t_str.f_len); len = dfield_get_len(field); - field->type.mtype = word_dtype->mtype; - field->type.prtype = word_dtype->prtype | DATA_NOT_NULL; + dict_col_copy_type(dict_index_get_nth_col(buf->index, 0), &field->type); + field->type.prtype |= DATA_NOT_NULL; + ut_ad(len <= field->type.len); - /* Variable length field, set to max size. 
*/ - field->type.len = FTS_MAX_WORD_LEN; - field->type.mbminmaxlen = word_dtype->mbminmaxlen; + /* For the temporary file, row_merge_buf_encode() uses + 1 byte for representing the number of extra_size bytes. + This number will always be 1, because for this 3-field index + consisting of one variable-size column, extra_size will always + be 1 or 2, which can be encoded in one byte. + + The extra_size is 1 byte if the length of the + variable-length column is less than 128 bytes or the + maximum length is less than 256 bytes. */ + if (len < 128 || field->type.len < 256) { + /* Extra size is one byte. */ + cur_len = 2 + len; + } else { + /* Extra size is two bytes. */ + cur_len = 3 + len; + } - cur_len += len; dfield_dup(field, buf->heap); field++; @@ -683,20 +701,6 @@ cur_len += len; dfield_dup(field, buf->heap); - /* One variable length column, word with its lenght less than - fts_max_token_size, add one extra size and one extra byte. - - Since the max length for FTS token now is larger than 255, - so we will need to signify length byte itself, so only 1 to 128 - bytes can be used for 1 bytes, larger than that 2 bytes. */ - if (t_str.f_len < 128) { - /* Extra size is one byte. */ - cur_len += 2; - } else { - /* Extra size is two bytes. */ - cur_len += 3; - } - /* Reserve one byte for the end marker of row_merge_block_t and we have reserved ROW_MERGE_RESERVE_SIZE (= 4) for encryption key_version in the beginning of the buffer. */ @@ -793,7 +797,6 @@ fts_parallel_tokenization( mem_heap_t* blob_heap = NULL; fts_doc_t doc; dict_table_t* table = psort_info->psort_common->new_table; - dtype_t word_dtype; dict_field_t* idx_field; fts_tokenize_ctx_t t_ctx; ulint retried = 0; @@ -825,10 +828,6 @@ fts_parallel_tokenization( idx_field = dict_index_get_nth_field( psort_info->psort_common->dup->index, 0); - word_dtype.prtype = idx_field->col->prtype; - word_dtype.mbminmaxlen = idx_field->col->mbminmaxlen; - word_dtype.mtype = (strcmp(doc.charset->name, "latin1_swedish_ci") == 0) - ? DATA_VARCHAR : DATA_VARMYSQL; block = psort_info->merge_block; crypt_block = psort_info->crypt_block; @@ -881,7 +880,6 @@ loop: processed = row_merge_fts_doc_tokenize( buf, doc_item->doc_id, &doc, - &word_dtype, merge_file, psort_info->psort_common->opt_doc_id_size, &t_ctx); diff --git a/storage/xtradb/fts/fts0fts.cc b/storage/xtradb/fts/fts0fts.cc index 5e008b37b8d..ea532c181ae 100644 --- a/storage/xtradb/fts/fts0fts.cc +++ b/storage/xtradb/fts/fts0fts.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, MariaDB Corporation. All Rights reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1931,6 +1932,8 @@ func_exit: /*************************************************************//** Wrapper function of fts_create_index_tables_low(), create auxiliary tables for an FTS index + +@see row_merge_create_fts_sort_index() @return: DB_SUCCESS or error code */ static dict_table_t* @@ -1962,13 +1965,12 @@ fts_create_one_index_table( (int)(field->col->prtype & DATA_MYSQL_TYPE_MASK), (uint) dtype_get_charset_coll(field->col->prtype)); - if (strcmp(charset->name, "latin1_swedish_ci") == 0) { - dict_mem_table_add_col(new_table, heap, "word", DATA_VARCHAR, - field->col->prtype, FTS_MAX_WORD_LEN); - } else { - dict_mem_table_add_col(new_table, heap, "word", DATA_VARMYSQL, - field->col->prtype, FTS_MAX_WORD_LEN); - } + dict_mem_table_add_col(new_table, heap, "word", + charset == &my_charset_latin1 + ? DATA_VARCHAR : DATA_VARMYSQL, + field->col->prtype, + FTS_MAX_WORD_LEN_IN_CHAR + * DATA_MBMAXLEN(field->col->mbminmaxlen)); dict_mem_table_add_col(new_table, heap, "first_doc_id", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, diff --git a/storage/xtradb/fts/fts0opt.cc b/storage/xtradb/fts/fts0opt.cc index d9f2532578e..84f0563e038 100644 --- a/storage/xtradb/fts/fts0opt.cc +++ b/storage/xtradb/fts/fts0opt.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, MariaDB Corporation. All Rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -281,7 +282,7 @@ fts_zip_initialize( zip->last_big_block = 0; zip->word.f_len = 0; - memset(zip->word.f_str, 0, FTS_MAX_WORD_LEN); + *zip->word.f_str = 0; ib_vector_reset(zip->blocks); diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 5065bdd8e43..6d956f5574b 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -4300,7 +4300,6 @@ innobase_change_buffering_inited_ok: and consequently we do not need to know the ordering internally in InnoDB. */ - ut_a(0 == strcmp(my_charset_latin1.name, "latin1_swedish_ci")); srv_latin1_ordering = my_charset_latin1.sort_order; innobase_commit_concurrency_init_default(); @@ -7254,18 +7253,16 @@ get_innobase_type_from_mysql_type( case MYSQL_TYPE_VARCHAR: /* new >= 5.0.3 true VARCHAR */ if (field->binary()) { return(DATA_BINARY); - } else if (strcmp(field->charset()->name, - "latin1_swedish_ci") == 0) { + } else if (field->charset() == &my_charset_latin1) { return(DATA_VARCHAR); } else { return(DATA_VARMYSQL); } case MYSQL_TYPE_BIT: - case MYSQL_TYPE_STRING: if (field->binary()) { - + case MYSQL_TYPE_STRING: + if (field->binary()) { return(DATA_FIXBINARY); - } else if (strcmp(field->charset()->name, - "latin1_swedish_ci") == 0) { + } else if (field->charset() == &my_charset_latin1) { return(DATA_CHAR); } else { return(DATA_MYSQL); diff --git a/storage/xtradb/include/fts0fts.h b/storage/xtradb/include/fts0fts.h index 68d4d333245..1b9b1250e9a 100644 --- a/storage/xtradb/include/fts0fts.h +++ b/storage/xtradb/include/fts0fts.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, MariaDB Corporation. 
All Rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -366,8 +367,8 @@ extern ulong fts_min_token_size; need a sync to free some memory */ extern bool fts_need_sync; -/** Maximum possible Fulltext word length */ -#define FTS_MAX_WORD_LEN HA_FT_MAXBYTELEN +/** Maximum possible Fulltext word length in bytes (assuming mbmaxlen=4) */ +#define FTS_MAX_WORD_LEN (HA_FT_MAXCHARLEN * 4) /** Maximum possible Fulltext word length (in characters) */ #define FTS_MAX_WORD_LEN_IN_CHAR HA_FT_MAXCHARLEN diff --git a/storage/xtradb/row/row0ftsort.cc b/storage/xtradb/row/row0ftsort.cc index a3a00620a6d..032b0badcbd 100644 --- a/storage/xtradb/row/row0ftsort.cc +++ b/storage/xtradb/row/row0ftsort.cc @@ -60,6 +60,8 @@ tokenized doc string. The index has three "fields": integer value) 3) Word's position in original doc. +@see fts_create_one_index_table() + @return dict_index_t structure for the fts sort index */ UNIV_INTERN dict_index_t* @@ -101,16 +103,12 @@ row_merge_create_fts_sort_index( field->prefix_len = 0; field->col = static_cast( mem_heap_alloc(new_index->heap, sizeof(dict_col_t))); - field->col->len = FTS_MAX_WORD_LEN; - - if (strcmp(charset->name, "latin1_swedish_ci") == 0) { - field->col->mtype = DATA_VARCHAR; - } else { - field->col->mtype = DATA_VARMYSQL; - } - field->col->prtype = idx_field->col->prtype | DATA_NOT_NULL; + field->col->mtype = charset == &my_charset_latin1 + ? DATA_VARCHAR : DATA_VARMYSQL; field->col->mbminmaxlen = idx_field->col->mbminmaxlen; + field->col->len = HA_FT_MAXCHARLEN * DATA_MBMAXLEN(field->col->mbminmaxlen); + field->fixed_len = 0; /* Doc ID */ @@ -407,6 +405,7 @@ row_fts_free_pll_merge_buf( /*********************************************************************//** Tokenize incoming text data and add to the sort buffer. +@see row_merge_buf_encode() @return TRUE if the record passed, FALSE if out of space */ static ibool @@ -415,8 +414,6 @@ row_merge_fts_doc_tokenize( row_merge_buf_t** sort_buf, /*!< in/out: sort buffer */ doc_id_t doc_id, /*!< in: Doc ID */ fts_doc_t* doc, /*!< in: Doc to be tokenized */ - dtype_t* word_dtype, /*!< in: data structure for - word col */ merge_file_t** merge_file, /*!< in/out: merge file */ ibool opt_doc_id_size,/*!< in: whether to use 4 bytes instead of 8 bytes integer to @@ -448,7 +445,7 @@ row_merge_fts_doc_tokenize( ulint idx = 0; ib_uint32_t position; ulint offset = 0; - ulint cur_len = 0; + ulint cur_len; doc_id_t write_doc_id; inc = innobase_mysql_fts_get_token( @@ -502,14 +499,34 @@ row_merge_fts_doc_tokenize( dfield_set_data(field, t_str.f_str, t_str.f_len); len = dfield_get_len(field); - field->type.mtype = word_dtype->mtype; - field->type.prtype = word_dtype->prtype | DATA_NOT_NULL; + dict_col_copy_type(dict_index_get_nth_col(buf->index, 0), &field->type); + field->type.prtype |= DATA_NOT_NULL; + ut_ad(len <= field->type.len); - /* Variable length field, set to max size. */ - field->type.len = FTS_MAX_WORD_LEN; - field->type.mbminmaxlen = word_dtype->mbminmaxlen; + /* For the temporary file, row_merge_buf_encode() uses + 1 byte for representing the number of extra_size bytes. + This number will always be 1, because for this 3-field index + consisting of one variable-size column, extra_size will always + be 1 or 2, which can be encoded in one byte. + + The extra_size is 1 byte if the length of the + variable-length column is less than 128 bytes or the + maximum length is less than 256 bytes. 
*/ + if (len < 128 || field->type.len < 256) { + /* Extra size is one byte. */ + cur_len = 2 + len; + } else { + /* Extra size is two bytes. */ + cur_len = 3 + len; + } - cur_len += len; dfield_dup(field, buf->heap); field++; @@ -559,20 +576,6 @@ cur_len += len; dfield_dup(field, buf->heap); - /* One variable length column, word with its lenght less than - fts_max_token_size, add one extra size and one extra byte. - - Since the max length for FTS token now is larger than 255, - so we will need to signify length byte itself, so only 1 to 128 - bytes can be used for 1 bytes, larger than that 2 bytes. */ - if (t_str.f_len < 128) { - /* Extra size is one byte. */ - cur_len += 2; - } else { - /* Extra size is two bytes. */ - cur_len += 3; - } - /* Reserve one byte for the end marker of row_merge_block_t and we have reserved ROW_MERGE_RESERVE_SIZE (= 4) for encryption key_version in the beginning of the buffer. */ @@ -668,7 +671,6 @@ fts_parallel_tokenization( mem_heap_t* blob_heap = NULL; fts_doc_t doc; dict_table_t* table = psort_info->psort_common->new_table; - dtype_t word_dtype; dict_field_t* idx_field; fts_tokenize_ctx_t t_ctx; ulint retried = 0; @@ -694,10 +696,6 @@ fts_parallel_tokenization( idx_field = dict_index_get_nth_field( psort_info->psort_common->dup->index, 0); - word_dtype.prtype = idx_field->col->prtype; - word_dtype.mbminmaxlen = idx_field->col->mbminmaxlen; - word_dtype.mtype = (strcmp(doc.charset->name, "latin1_swedish_ci") == 0) - ? DATA_VARCHAR : DATA_VARMYSQL; block = psort_info->merge_block; crypt_block = psort_info->crypt_block; @@ -750,7 +748,6 @@ loop: processed = row_merge_fts_doc_tokenize( buf, doc_item->doc_id, &doc, - &word_dtype, merge_file, psort_info->psort_common->opt_doc_id_size, &t_ctx);
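
Note on the new size accounting (editorial addition, not part of the patch): the "cur_len = 2 + len" / "cur_len = 3 + len" branches above must reserve exactly the bytes that row_merge_buf_encode() later writes for the "word" column, whose maximum is now FTS_MAX_WORD_LEN_IN_CHAR (84) characters times the charset's mbmaxlen: 84 bytes for latin1, 252 for utf8mb3, 336 for utf8mb4. The standalone C++ sketch below restates that rule under those assumptions; token_buf_size() is a hypothetical name for illustration, not an InnoDB function.

	#include <cassert>
	#include <cstddef>

	/* Hypothetical stand-in for the InnoDB bookkeeping; only the
	   arithmetic is taken from the patch. 'len' is the tokenized
	   word's actual size in bytes and 'max_len' is the "word"
	   column's maximum: 84 characters times mbmaxlen, i.e. 84,
	   252 or 336 bytes. */
	static std::size_t token_buf_size(std::size_t len, std::size_t max_len)
	{
		assert(len <= max_len);

		/* The length prefix of the variable-length "word" column
		   occupies one byte when the value is shorter than 128
		   bytes or the column maximum fits in one byte; otherwise
		   it occupies two bytes. */
		const std::size_t extra_size = (len < 128 || max_len < 256) ? 1 : 2;

		/* The temporary merge file spends one further byte
		   recording how many extra_size bytes follow, giving
		   2 + len or 3 + len in total, matching the branches in
		   row_merge_fts_doc_tokenize() above. */
		return 1 + extra_size + len;
	}

	int main()
	{
		assert(token_buf_size(84, 84) == 86);    /* latin1, 84-char token  */
		assert(token_buf_size(252, 252) == 254); /* utf8mb3: max < 256     */
		assert(token_buf_size(130, 336) == 133); /* utf8mb4, token >= 128  */
		return 0;
	}

Because 252 < 256, latin1 and utf8mb3 words always take the one-byte length prefix; only utf8mb4, with its 336-byte maximum, can need the two-byte form for tokens of 128 bytes or more, which is the regime the 84-character 4-byte-per-character tokens in the new test exercise.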