From 9e22cae1cfcb4fce7a6469e9b136d599efe6b87c Mon Sep 17 00:00:00 2001 From: Sergei Golubchik Date: Wed, 23 May 2018 23:30:14 +0200 Subject: [PATCH 1/8] embedded use-after-free ASAN error Close MYSQL (and destroy THD) in the same thread where it was used, because THD embeds MDL_context, that owns some LF_PINS, that remember a pointer to my_thread_var->stack_ends_here. --- client/mysqltest.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/client/mysqltest.cc b/client/mysqltest.cc index 5ab84323a76..620a3417ac3 100644 --- a/client/mysqltest.cc +++ b/client/mysqltest.cc @@ -903,6 +903,8 @@ pthread_handler_t connection_thread(void *arg) end_thread: cn->query_done= 1; + mysql_close(cn->mysql); + cn->mysql= 0; mysql_thread_end(); pthread_exit(0); return 0; From 29dbb23fb7ac3b10e70e4c5f99dcedab91af85ba Mon Sep 17 00:00:00 2001 From: Monty Date: Thu, 24 May 2018 15:18:09 +0300 Subject: [PATCH 2/8] MDEV-16093 Memory leak with triggers Problem was that blob memory allocated in Table_trigger_list was not properly freed --- mysql-test/r/trigger.result | 13 ++++++++++++- mysql-test/t/trigger.test | 14 +++++++++++++- sql/sql_trigger.cc | 6 ++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/mysql-test/r/trigger.result b/mysql-test/r/trigger.result index 8455450e294..8852a622251 100644 --- a/mysql-test/r/trigger.result +++ b/mysql-test/r/trigger.result @@ -2306,4 +2306,15 @@ CREATE TABLE t1 (i INT); insert into t2 value (2); DROP VIEW v1; DROP TABLE t1,t2,t3; -End of 10.1 tests. +# +# MDEV-16093 +# Assertion `global_status_var.global_memory_used == 0' failed or +# bytes lost after inserting into table with non-null blob and trigger +# +CREATE TABLE t1 (b BLOB NOT NULL); +CREATE TRIGGER tr BEFORE UPDATE ON t1 FOR EACH ROW BEGIN END; +INSERT INTO t1 VALUES ('foo'); +DROP TABLE t1; +# +# End of 10.1 tests. +# diff --git a/mysql-test/t/trigger.test b/mysql-test/t/trigger.test index ff6f38b719d..a6d4107e591 100644 --- a/mysql-test/t/trigger.test +++ b/mysql-test/t/trigger.test @@ -2656,5 +2656,17 @@ insert into t2 value (2); DROP VIEW v1; DROP TABLE t1,t2,t3; +--echo # +--echo # MDEV-16093 +--echo # Assertion `global_status_var.global_memory_used == 0' failed or +--echo # bytes lost after inserting into table with non-null blob and trigger +--echo # ---echo End of 10.1 tests. +CREATE TABLE t1 (b BLOB NOT NULL); +CREATE TRIGGER tr BEFORE UPDATE ON t1 FOR EACH ROW BEGIN END; +INSERT INTO t1 VALUES ('foo'); +DROP TABLE t1; + +--echo # +--echo # End of 10.1 tests. +--echo # diff --git a/sql/sql_trigger.cc b/sql/sql_trigger.cc index 293a4c17156..bbcc75718a3 100644 --- a/sql/sql_trigger.cc +++ b/sql/sql_trigger.cc @@ -614,6 +614,7 @@ end: #endif /* WITH_WSREP */ } + /** Build stmt_query to write it in the bin-log and get the trigger definer. @@ -1061,6 +1062,11 @@ Table_triggers_list::~Table_triggers_list() for (int j= 0; j < (int)TRG_ACTION_MAX; j++) delete bodies[i][j]; + /* Free blobs used in insert */ + if (record0_field) + for (Field **fld_ptr= record0_field; *fld_ptr; fld_ptr++) + (*fld_ptr)->free(); + if (record1_field) for (Field **fld_ptr= record1_field; *fld_ptr; fld_ptr++) delete *fld_ptr; From 5a16fe0e6f33f1b123bbe9422126dd3b0fdf5ed1 Mon Sep 17 00:00:00 2001 From: Monty Date: Thu, 24 May 2018 15:19:55 +0300 Subject: [PATCH 3/8] Fixed compiler warnings When merging this with 10.2 and later, one can just use the 10.2 or later code --- storage/connect/ha_connect.cc | 2 +- storage/sphinx/ha_sphinx.cc | 87 +++++++++++++++++------------------ 2 files changed, 43 insertions(+), 46 deletions(-) diff --git a/storage/connect/ha_connect.cc b/storage/connect/ha_connect.cc index e6bfa97f327..af329c0c8df 100644 --- a/storage/connect/ha_connect.cc +++ b/storage/connect/ha_connect.cc @@ -432,7 +432,7 @@ handlerton *connect_hton= NULL; uint GetTraceValue(void) {return (uint)(connect_hton ? THDVAR(current_thd, xtrace) : 0);} bool ExactInfo(void) {return THDVAR(current_thd, exact_info);} -bool CondPushEnabled(void) {return THDVAR(current_thd, cond_push);} +static bool CondPushEnabled(void) {return THDVAR(current_thd, cond_push);} USETEMP UseTemp(void) {return (USETEMP)THDVAR(current_thd, use_tempfile);} int GetConvSize(void) {return THDVAR(current_thd, conv_size);} TYPCONV GetTypeConv(void) {return (TYPCONV)THDVAR(current_thd, type_conv);} diff --git a/storage/sphinx/ha_sphinx.cc b/storage/sphinx/ha_sphinx.cc index 5308f780e0e..3f6770b5d26 100644 --- a/storage/sphinx/ha_sphinx.cc +++ b/storage/sphinx/ha_sphinx.cc @@ -17,7 +17,7 @@ #pragma implementation // gcc: Class implementation #endif -#if _MSC_VER>=1400 +#if defined(_MSC_VER) && _MSC_VER>=1400 #define _CRT_SECURE_NO_DEPRECATE 1 #define _CRT_NONSTDC_NO_DEPRECATE 1 #endif @@ -64,7 +64,7 @@ #define MSG_WAITALL 0 #endif -#if _MSC_VER>=1400 +#if defined(_MSC_VER) && _MSC_VER>=1400 #pragma warning(push,4) #endif @@ -1041,8 +1041,8 @@ static bool ParseUrl ( CSphSEShare * share, TABLE * table, bool bCreate ) bool bOk = true; bool bQL = false; char * sScheme = NULL; - char * sHost = SPHINXAPI_DEFAULT_HOST; - char * sIndex = SPHINXAPI_DEFAULT_INDEX; + char * sHost = (char*) SPHINXAPI_DEFAULT_HOST; + char * sIndex = (char*) SPHINXAPI_DEFAULT_INDEX; int iPort = SPHINXAPI_DEFAULT_PORT; // parse connection string, if any @@ -1068,12 +1068,12 @@ static bool ParseUrl ( CSphSEShare * share, TABLE * table, bool bCreate ) sHost--; // reuse last slash iPort = 0; if (!( sIndex = strrchr ( sHost, ':' ) )) - sIndex = SPHINXAPI_DEFAULT_INDEX; + sIndex = (char*) SPHINXAPI_DEFAULT_INDEX; else { *sIndex++ = '\0'; if ( !*sIndex ) - sIndex = SPHINXAPI_DEFAULT_INDEX; + sIndex = (char*) SPHINXAPI_DEFAULT_INDEX; } bOk = true; break; @@ -1095,7 +1095,7 @@ static bool ParseUrl ( CSphSEShare * share, TABLE * table, bool bCreate ) if ( sIndex ) *sIndex++ = '\0'; else - sIndex = SPHINXAPI_DEFAULT_INDEX; + sIndex = (char*) SPHINXAPI_DEFAULT_INDEX; iPort = atoi(sPort); if ( !iPort ) @@ -1107,7 +1107,7 @@ static bool ParseUrl ( CSphSEShare * share, TABLE * table, bool bCreate ) if ( sIndex ) *sIndex++ = '\0'; else - sIndex = SPHINXAPI_DEFAULT_INDEX; + sIndex = (char*) SPHINXAPI_DEFAULT_INDEX; } bOk = true; break; @@ -1303,8 +1303,8 @@ CSphSEQuery::CSphSEQuery ( const char * sQuery, int iLength, const char * sIndex , m_sGeoLongAttr ( "" ) , m_fGeoLatitude ( 0.0f ) , m_fGeoLongitude ( 0.0f ) - , m_sComment ( "" ) - , m_sSelect ( "*" ) + , m_sComment ( (char*) "" ) + , m_sSelect ( (char*) "*" ) , m_pBuf ( NULL ) , m_pCur ( NULL ) @@ -1738,7 +1738,7 @@ bool CSphSEQuery::ParseField ( char * sField ) } } else if ( !strcmp ( sName, "override" ) ) // name,type,id:value,id:value,... { - char * sName = NULL; + sName = NULL; int iType = 0; CSphSEQuery::Override_t * pOverride = NULL; @@ -1794,7 +1794,7 @@ bool CSphSEQuery::ParseField ( char * sField ) *sRest++ = '\0'; if (!( sRest - sId )) break; - char * sValue = sRest; + sValue = sRest; if ( ( sRest = strchr ( sRest, ',' ) )!=NULL ) *sRest++ = '\0'; if ( !*sValue ) @@ -2213,7 +2213,7 @@ int ha_sphinx::Connect ( const char * sHost, ushort uPort ) } char sError[512]; - int iSocket = socket ( iDomain, SOCK_STREAM, 0 ); + int iSocket = (int) socket ( iDomain, SOCK_STREAM, 0 ); if ( iSocket<0 ) { @@ -2538,12 +2538,6 @@ char * ha_sphinx::UnpackString () } -static inline const char * FixNull ( const char * s ) -{ - return s ? s : "(null)"; -} - - bool ha_sphinx::UnpackSchema () { SPH_ENTER_METHOD(); @@ -2674,7 +2668,7 @@ bool ha_sphinx::UnpackStats ( CSphSEStats * pStats ) assert ( pStats ); char * pCurSave = m_pCur; - for ( uint i=0; istore ( sBuf, pCur-sBuf, &my_charset_bin ); + af->store ( sBuf, uint(pCur-sBuf), &my_charset_bin ); } break; @@ -3386,39 +3380,39 @@ ha_rows ha_sphinx::records_in_range ( uint, key_range *, key_range * ) // currently provided for doing that. // // Called from handle.cc by ha_create_table(). -int ha_sphinx::create ( const char * name, TABLE * table, HA_CREATE_INFO * ) +int ha_sphinx::create ( const char * name, TABLE * table_arg, HA_CREATE_INFO * ) { SPH_ENTER_METHOD(); char sError[256]; CSphSEShare tInfo; - if ( !ParseUrl ( &tInfo, table, true ) ) + if ( !ParseUrl ( &tInfo, table_arg, true ) ) SPH_RET(-1); // check SphinxAPI table for ( ; !tInfo.m_bSphinxQL; ) { // check system fields (count and types) - if ( table->s->fieldss->fieldsfield[0] ) ) + if ( !IsIDField ( table_arg->field[0] ) ) { my_snprintf ( sError, sizeof(sError), "%s: 1st column (docid) MUST be unsigned integer or bigint", name ); break; } - if ( !IsIntegerFieldType ( table->field[1]->type() ) ) + if ( !IsIntegerFieldType ( table_arg->field[1]->type() ) ) { my_snprintf ( sError, sizeof(sError), "%s: 2nd column (weight) MUST be integer or bigint", name ); break; } - enum_field_types f2 = table->field[2]->type(); + enum_field_types f2 = table_arg->field[2]->type(); if ( f2!=MYSQL_TYPE_VARCHAR && f2!=MYSQL_TYPE_BLOB && f2!=MYSQL_TYPE_MEDIUM_BLOB && f2!=MYSQL_TYPE_LONG_BLOB && f2!=MYSQL_TYPE_TINY_BLOB ) { @@ -3428,28 +3422,28 @@ int ha_sphinx::create ( const char * name, TABLE * table, HA_CREATE_INFO * ) // check attributes int i; - for ( i=3; i<(int)table->s->fields; i++ ) + for ( i=3; i<(int)table_arg->s->fields; i++ ) { - enum_field_types eType = table->field[i]->type(); + enum_field_types eType = table_arg->field[i]->type(); if ( eType!=MYSQL_TYPE_TIMESTAMP && !IsIntegerFieldType(eType) && eType!=MYSQL_TYPE_VARCHAR && eType!=MYSQL_TYPE_FLOAT ) { my_snprintf ( sError, sizeof(sError), "%s: %dth column (attribute %s) MUST be integer, bigint, timestamp, varchar, or float", - name, i+1, table->field[i]->field_name ); + name, i+1, table_arg->field[i]->field_name ); break; } } - if ( i!=(int)table->s->fields ) + if ( i!=(int)table_arg->s->fields ) break; // check index if ( - table->s->keys!=1 || - table->key_info[0].user_defined_key_parts!=1 || - strcasecmp ( table->key_info[0].key_part[0].field->field_name, table->field[2]->field_name ) ) + table_arg->s->keys!=1 || + table_arg->key_info[0].user_defined_key_parts!=1 || + strcasecmp ( table_arg->key_info[0].key_part[0].field->field_name, table->field[2]->field_name ) ) { my_snprintf ( sError, sizeof(sError), "%s: there must be an index on '%s' column", - name, table->field[2]->field_name ); + name, table_arg->field[2]->field_name ); break; } @@ -3464,13 +3458,13 @@ int ha_sphinx::create ( const char * name, TABLE * table, HA_CREATE_INFO * ) sError[0] = '\0'; // check that 1st column is id, is of int type, and has an index - if ( strcmp ( table->field[0]->field_name, "id" ) ) + if ( strcmp ( table_arg->field[0]->field_name, "id" ) ) { my_snprintf ( sError, sizeof(sError), "%s: 1st column must be called 'id'", name ); break; } - if ( !IsIDField ( table->field[0] ) ) + if ( !IsIDField ( table_arg->field[0] ) ) { my_snprintf ( sError, sizeof(sError), "%s: 'id' column must be INT UNSIGNED or BIGINT", name ); break; @@ -3478,22 +3472,22 @@ int ha_sphinx::create ( const char * name, TABLE * table, HA_CREATE_INFO * ) // check index if ( - table->s->keys!=1 || - table->key_info[0].user_defined_key_parts!=1 || - strcasecmp ( table->key_info[0].key_part[0].field->field_name, "id" ) ) + table_arg->s->keys!=1 || + table_arg->key_info[0].user_defined_key_parts!=1 || + strcasecmp ( table_arg->key_info[0].key_part[0].field->field_name, "id" ) ) { my_snprintf ( sError, sizeof(sError), "%s: 'id' column must be indexed", name ); break; } // check column types - for ( int i=1; i<(int)table->s->fields; i++ ) + for ( int i=1; i<(int)table_arg->s->fields; i++ ) { - enum_field_types eType = table->field[i]->type(); + enum_field_types eType = table_arg->field[i]->type(); if ( eType!=MYSQL_TYPE_TIMESTAMP && !IsIntegerFieldType(eType) && eType!=MYSQL_TYPE_VARCHAR && eType!=MYSQL_TYPE_FLOAT ) { my_snprintf ( sError, sizeof(sError), "%s: column %d(%s) is of unsupported type (use int/bigint/timestamp/varchar/float)", - name, i+1, table->field[i]->field_name ); + name, i+1, table_arg->field[i]->field_name ); break; } } @@ -3507,8 +3501,11 @@ int ha_sphinx::create ( const char * name, TABLE * table, HA_CREATE_INFO * ) // report and bail if ( sError[0] ) { - my_error ( ER_CANT_CREATE_TABLE, MYF(0), - table->s->db.str, table->s->table_name, sError ); + my_printf_error(ER_CANT_CREATE_TABLE, + "Can\'t create table %s.%s (Error: %s)", + MYF(0), + table_arg->s->db.str, + table_arg->s->table_name.str, sError); SPH_RET(-1); } From 199517f501b5d50daf85d3d5620cb391c03fddfe Mon Sep 17 00:00:00 2001 From: Monty Date: Thu, 24 May 2018 19:09:33 +0300 Subject: [PATCH 4/8] Avoid warnings in String::copy when copying string on itself (ok to do) --- sql/sql_string.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sql/sql_string.cc b/sql/sql_string.cc index 20772adcb22..2c5be35e887 100644 --- a/sql/sql_string.cc +++ b/sql/sql_string.cc @@ -183,7 +183,16 @@ bool String::copy(const char *str,uint32 arg_length, CHARSET_INFO *cs) { if (alloc(arg_length)) return TRUE; - if ((str_length=arg_length)) + if (Ptr == str && arg_length == str_length) + { + /* + This can happen in some cases. This code is here mainly to avoid + warnings from valgrind, but can also be an indication of error. + */ + DBUG_PRINT("warning", ("Copying string on itself: %p %lu", + str, arg_length)); + } + else if ((str_length=arg_length)) memcpy(Ptr,str,arg_length); Ptr[arg_length]=0; str_charset=cs; From d8da920264a0321e6d03b3cbe3c3b414f622aefa Mon Sep 17 00:00:00 2001 From: Monty Date: Fri, 25 May 2018 11:51:43 +0300 Subject: [PATCH 5/8] MDEV-10679 Crash in performance schema and partitioning with discovery Crash happened because in discover, table->work_part_info was not properly reset before execution. Fixed by resetting before calling execute alter table, create table or mysql_create_frm_image. --- mysql-test/suite/perfschema/r/partition.result | 10 ++++++++++ mysql-test/suite/perfschema/t/partition.test | 15 +++++++++++++++ sql/sql_alter.cc | 5 +++-- sql/sql_parse.cc | 5 +---- sql/sql_table.cc | 3 +++ sql/table.cc | 1 + 6 files changed, 33 insertions(+), 6 deletions(-) create mode 100644 mysql-test/suite/perfschema/r/partition.result create mode 100644 mysql-test/suite/perfschema/t/partition.test diff --git a/mysql-test/suite/perfschema/r/partition.result b/mysql-test/suite/perfschema/r/partition.result new file mode 100644 index 00000000000..9bc624268bb --- /dev/null +++ b/mysql-test/suite/perfschema/r/partition.result @@ -0,0 +1,10 @@ +# +# MDEV-10679 +# Server crashes in in mysql_create_frm_image upon query from +# performance schema in ps-protocol mode +# +CREATE TABLE t1 (i INT); +ALTER TABLE t1 ADD PARTITION (PARTITION p VALUES LESS THAN (1)); +ERROR HY000: Partition management on a not partitioned table is not possible +SELECT * FROM performance_schema.events_stages_summary_by_user_by_event_name; +DROP TABLE t1; diff --git a/mysql-test/suite/perfschema/t/partition.test b/mysql-test/suite/perfschema/t/partition.test new file mode 100644 index 00000000000..073a41e9252 --- /dev/null +++ b/mysql-test/suite/perfschema/t/partition.test @@ -0,0 +1,15 @@ +--source include/have_perfschema.inc + +--echo # +--echo # MDEV-10679 +--echo # Server crashes in in mysql_create_frm_image upon query from +--echo # performance schema in ps-protocol mode +--echo # + +CREATE TABLE t1 (i INT); +--error ER_PARTITION_MGMT_ON_NONPARTITIONED +ALTER TABLE t1 ADD PARTITION (PARTITION p VALUES LESS THAN (1)); +--disable_result_log +SELECT * FROM performance_schema.events_stages_summary_by_user_by_event_name; +--enable_result_log +DROP TABLE t1; diff --git a/sql/sql_alter.cc b/sql/sql_alter.cc index bff45e089a4..6f21fb4b931 100644 --- a/sql/sql_alter.cc +++ b/sql/sql_alter.cc @@ -262,8 +262,8 @@ bool Sql_cmd_alter_table::execute(THD *thd) - For temporary MERGE tables we do not track if their child tables are base or temporary. As result we can't guarantee that privilege check - which was done in presence of temporary child will stay relevant later - as this temporary table might be removed. + which was done in presence of temporary child will stay relevant + later as this temporary table might be removed. If SELECT_ACL | UPDATE_ACL | DELETE_ACL privileges were not checked for the underlying *base* tables, it would create a security breach as in @@ -303,6 +303,7 @@ bool Sql_cmd_alter_table::execute(THD *thd) create_info.data_file_name= create_info.index_file_name= NULL; thd->enable_slow_log= opt_log_slow_admin_statements; + thd->work_part_info= 0; #ifdef WITH_WSREP TABLE *find_temporary_table(THD *thd, const TABLE_LIST *tl); diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index 55673ba2713..be7408af77d 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -2475,10 +2475,6 @@ mysql_execute_command(THD *thd) #endif DBUG_ENTER("mysql_execute_command"); -#ifdef WITH_PARTITION_STORAGE_ENGINE - thd->work_part_info= 0; -#endif - DBUG_ASSERT(thd->transaction.stmt.is_empty() || thd->in_sub_stmt); /* Each statement or replication event which might produce deadlock @@ -3318,6 +3314,7 @@ mysql_execute_command(THD *thd) create_info.add(DDL_options_st::OPT_OR_REPLACE_SLAVE_GENERATED); } + thd->work_part_info= 0; #ifdef WITH_PARTITION_STORAGE_ENGINE { partition_info *part_info= thd->lex->part_info; diff --git a/sql/sql_table.cc b/sql/sql_table.cc index 31da585443a..a68f9e626e0 100644 --- a/sql/sql_table.cc +++ b/sql/sql_table.cc @@ -6057,6 +6057,7 @@ remove_key: } } + DBUG_ASSERT(thd->work_part_info == 0); #ifdef WITH_PARTITION_STORAGE_ENGINE partition_info *tab_part_info= table->part_info; thd->work_part_info= thd->lex->part_info; @@ -8411,6 +8412,8 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, { DBUG_ENTER("mysql_alter_table"); + thd->work_part_info= 0; // Used by partitioning + /* Check if we attempt to alter mysql.slow_log or mysql.general_log table and return an error if diff --git a/sql/table.cc b/sql/table.cc index 6770ebdfd4a..d053e9b5670 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -2233,6 +2233,7 @@ int TABLE_SHARE::init_from_sql_statement_string(THD *thd, bool write, goto ret; thd->lex->create_info.db_type= hton; + thd->work_part_info= 0; // For partitioning if (tabledef_version.str) thd->lex->create_info.tabledef_version= tabledef_version; From 35a9c90fff5e2b8ffc5acc7d7ed051f04c013213 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 29 May 2018 08:54:33 +0300 Subject: [PATCH 6/8] MDEV-14589 InnoDB should not lock a delete-marked record When the transaction isolation level is SERIALIZABLE, or when a locking read is performed in the REPEATABLE READ isolation level, InnoDB must lock delete-marked records in order to prevent another transaction from inserting something. However, at READ UNCOMMITTED or READ COMMITTED isolation level or when the parameter innodb_locks_unsafe_for_binlog is set, the repeatability of the reads does not matter, and there is no need to lock any records. row_search_for_mysql(): Skip locks on delete-marked committed records upfront, instead of invoking row_unlock_for_mysql() afterwards. The unlocking never worked for secondary index records. --- mysql-test/suite/innodb/r/lock_deleted.result | 40 +++++++++++ mysql-test/suite/innodb/t/lock_deleted.test | 72 +++++++++++++++++++ storage/innobase/row/row0sel.cc | 35 +++++---- storage/xtradb/row/row0sel.cc | 35 +++++---- 4 files changed, 154 insertions(+), 28 deletions(-) create mode 100644 mysql-test/suite/innodb/r/lock_deleted.result create mode 100644 mysql-test/suite/innodb/t/lock_deleted.test diff --git a/mysql-test/suite/innodb/r/lock_deleted.result b/mysql-test/suite/innodb/r/lock_deleted.result new file mode 100644 index 00000000000..d949ac945de --- /dev/null +++ b/mysql-test/suite/innodb/r/lock_deleted.result @@ -0,0 +1,40 @@ +START TRANSACTION WITH CONSISTENT SNAPSHOT; +CREATE TABLE t1(a INT PRIMARY KEY, b INT UNIQUE) ENGINE=InnoDB; +INSERT INTO t1 VALUES(1,1); +DELETE FROM t1; +SET DEBUG_SYNC='row_ins_sec_index_unique SIGNAL inserted WAIT_FOR locked'; +BEGIN; +INSERT INTO t1 VALUES(1,1); +SET DEBUG_SYNC='now WAIT_FOR inserted'; +SET DEBUG_SYNC='innodb_row_search_for_mysql_exit SIGNAL locked'; +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +BEGIN; +DELETE FROM t1 WHERE b=1; +COMMIT; +SET DEBUG_SYNC='RESET'; +ROLLBACK; +SET DEBUG_SYNC='row_ins_sec_index_unique SIGNAL inserted WAIT_FOR locked'; +BEGIN; +INSERT INTO t1 VALUES(1,1); +SET DEBUG_SYNC='now WAIT_FOR inserted'; +SET DEBUG_SYNC='innodb_row_search_for_mysql_exit SIGNAL locked'; +SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; +BEGIN; +DELETE FROM t1 WHERE b=1; +COMMIT; +SET DEBUG_SYNC='RESET'; +ROLLBACK; +SET DEBUG_SYNC='row_ins_sec_index_unique SIGNAL inserted WAIT_FOR locked'; +BEGIN; +SET innodb_lock_wait_timeout=1; +INSERT INTO t1 VALUES(1,1); +SET DEBUG_SYNC='now WAIT_FOR inserted'; +SET DEBUG_SYNC='innodb_row_search_for_mysql_exit SIGNAL locked'; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +BEGIN; +DELETE FROM t1 WHERE b=1; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction +COMMIT; +SET DEBUG_SYNC='RESET'; +COMMIT; +DROP TABLE t1; diff --git a/mysql-test/suite/innodb/t/lock_deleted.test b/mysql-test/suite/innodb/t/lock_deleted.test new file mode 100644 index 00000000000..8dbad90d354 --- /dev/null +++ b/mysql-test/suite/innodb/t/lock_deleted.test @@ -0,0 +1,72 @@ +--source include/have_innodb.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc + +--source include/count_sessions.inc + +connect(stop_purge, localhost, root,,); +START TRANSACTION WITH CONSISTENT SNAPSHOT; +connect(delete, localhost, root,,); +connection default; + +CREATE TABLE t1(a INT PRIMARY KEY, b INT UNIQUE) ENGINE=InnoDB; +INSERT INTO t1 VALUES(1,1); +DELETE FROM t1; + +let $i=2; +while ($i) { +let $iso= `SELECT CASE $i WHEN 1 THEN 'UNCOMMITTED' ELSE 'COMMITTED' END`; + +SET DEBUG_SYNC='row_ins_sec_index_unique SIGNAL inserted WAIT_FOR locked'; +BEGIN; +send INSERT INTO t1 VALUES(1,1); + +connection delete; +SET DEBUG_SYNC='now WAIT_FOR inserted'; +SET DEBUG_SYNC='innodb_row_search_for_mysql_exit SIGNAL locked'; +eval SET SESSION TRANSACTION ISOLATION LEVEL READ $iso; +BEGIN; +send DELETE FROM t1 WHERE b=1; + +connection default; +reap; +connection delete; +reap; +COMMIT; + +connection default; +SET DEBUG_SYNC='RESET'; +ROLLBACK; + +dec $i; +} + +SET DEBUG_SYNC='row_ins_sec_index_unique SIGNAL inserted WAIT_FOR locked'; +BEGIN; +SET innodb_lock_wait_timeout=1; +send INSERT INTO t1 VALUES(1,1); + +connection delete; +SET DEBUG_SYNC='now WAIT_FOR inserted'; +SET DEBUG_SYNC='innodb_row_search_for_mysql_exit SIGNAL locked'; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +BEGIN; +send DELETE FROM t1 WHERE b=1; + +connection default; +--error ER_LOCK_WAIT_TIMEOUT +reap; +COMMIT; +SET DEBUG_SYNC='RESET'; + +connection delete; +reap; +COMMIT; + +disconnect delete; +disconnect stop_purge; + +connection default; +DROP TABLE t1; + +--source include/wait_until_count_sessions.inc diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index 7af788973f2..06bf4cc30c0 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -4656,9 +4656,27 @@ wrong_offs: ulint lock_type; + if (srv_locks_unsafe_for_binlog + || trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + /* At READ COMMITTED or READ UNCOMMITTED + isolation levels, do not lock committed + delete-marked records. */ + if (!rec_get_deleted_flag(rec, comp)) { + goto no_gap_lock; + } + if (trx_id_t trx_id = index == clust_index + ? row_get_rec_trx_id(rec, index, offsets) + : row_vers_impl_x_locked(rec, index, offsets)) { + if (trx_rw_is_active(trx_id, NULL)) { + /* The record belongs to an active + transaction. We must acquire a lock. */ + goto no_gap_lock; + } + } + goto locks_ok_del_marked; + } + if (!set_also_gap_locks - || srv_locks_unsafe_for_binlog - || trx->isolation_level <= TRX_ISO_READ_COMMITTED || (unique_search && !rec_get_deleted_flag(rec, comp))) { goto no_gap_lock; @@ -4849,20 +4867,9 @@ locks_ok: page_rec_is_comp() cannot be used! */ if (rec_get_deleted_flag(rec, comp)) { - +locks_ok_del_marked: /* The record is delete-marked: we can skip it */ - if ((srv_locks_unsafe_for_binlog - || trx->isolation_level <= TRX_ISO_READ_COMMITTED) - && prebuilt->select_lock_type != LOCK_NONE - && !did_semi_consistent_read) { - - /* No need to keep a lock on a delete-marked record - if we do not want to use next-key locking. */ - - row_unlock_for_mysql(prebuilt, TRUE); - } - /* This is an optimization to skip setting the next key lock on the record that follows this delete-marked record. This optimization works because of the unique search criteria diff --git a/storage/xtradb/row/row0sel.cc b/storage/xtradb/row/row0sel.cc index 97007c1107c..b6b5d107885 100644 --- a/storage/xtradb/row/row0sel.cc +++ b/storage/xtradb/row/row0sel.cc @@ -4665,9 +4665,27 @@ wrong_offs: ulint lock_type; + if (srv_locks_unsafe_for_binlog + || trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + /* At READ COMMITTED or READ UNCOMMITTED + isolation levels, do not lock committed + delete-marked records. */ + if (!rec_get_deleted_flag(rec, comp)) { + goto no_gap_lock; + } + if (trx_id_t trx_id = index == clust_index + ? row_get_rec_trx_id(rec, index, offsets) + : row_vers_impl_x_locked(rec, index, offsets)) { + if (trx_rw_is_active(trx_id, NULL)) { + /* The record belongs to an active + transaction. We must acquire a lock. */ + goto no_gap_lock; + } + } + goto locks_ok_del_marked; + } + if (!set_also_gap_locks - || srv_locks_unsafe_for_binlog - || trx->isolation_level <= TRX_ISO_READ_COMMITTED || (unique_search && !rec_get_deleted_flag(rec, comp))) { goto no_gap_lock; @@ -4862,20 +4880,9 @@ locks_ok: page_rec_is_comp() cannot be used! */ if (rec_get_deleted_flag(rec, comp)) { - +locks_ok_del_marked: /* The record is delete-marked: we can skip it */ - if ((srv_locks_unsafe_for_binlog - || trx->isolation_level <= TRX_ISO_READ_COMMITTED) - && prebuilt->select_lock_type != LOCK_NONE - && !did_semi_consistent_read) { - - /* No need to keep a lock on a delete-marked record - if we do not want to use next-key locking. */ - - row_unlock_for_mysql(prebuilt, TRUE); - } - /* This is an optimization to skip setting the next key lock on the record that follows this delete-marked record. This optimization works because of the unique search criteria From b7985a45a67f5e3537b1d7b5a6830a0ad4267554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 29 May 2018 08:55:07 +0300 Subject: [PATCH 7/8] Fix type mismatch --- sql/sql_string.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/sql_string.cc b/sql/sql_string.cc index 2c5be35e887..c22e33182c6 100644 --- a/sql/sql_string.cc +++ b/sql/sql_string.cc @@ -189,7 +189,7 @@ bool String::copy(const char *str,uint32 arg_length, CHARSET_INFO *cs) This can happen in some cases. This code is here mainly to avoid warnings from valgrind, but can also be an indication of error. */ - DBUG_PRINT("warning", ("Copying string on itself: %p %lu", + DBUG_PRINT("warning", ("Copying string on itself: %p %u", str, arg_length)); } else if ((str_length=arg_length)) From 6aa50bad3947a0eab24fb029cd58f5945439e365 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 29 May 2018 13:52:43 +0300 Subject: [PATCH 8/8] MDEV-16283 ALTER TABLE...DISCARD TABLESPACE still takes long on a large buffer pool Also fixes MDEV-14727, MDEV-14491 InnoDB: Error: Waited for 5 secs for hash index ref_count (1) to drop to 0 by replacing the flawed wait logic in dict_index_remove_from_cache_low(). On DISCARD TABLESPACE, there is no need to drop the adaptive hash index. We must drop it on IMPORT TABLESPACE, and eventually on DROP TABLE or DROP INDEX. As long as the dict_index_t object remains in the cache and the table remains inaccessible, the adaptive hash index entries to orphaned pages would not do any harm. They would be dropped when buffer pool pages are reused for something else. btr_search_drop_page_hash_when_freed(), buf_LRU_drop_page_hash_batch(): Remove the parameter zip_size, and pass 0 to buf_page_get_gen(). buf_page_get_gen(): Ignore zip_size if mode==BUF_PEEK_IF_IN_POOL. buf_LRU_drop_page_hash_for_tablespace(): Drop the adaptive hash index even if the tablespace is inaccessible. buf_LRU_drop_page_hash_for_tablespace(): New global function, to drop the adaptive hash index. buf_LRU_flush_or_remove_pages(), fil_delete_tablespace(): Remove the parameter drop_ahi. dict_index_remove_from_cache_low(): Actively drop the adaptive hash index if entries exist. This should prevent InnoDB hangs on DROP TABLE or DROP INDEX. row_import_for_mysql(): Drop any adaptive hash index entries for the table. row_drop_table_for_mysql(): Drop any adaptive hash index for the table, except if the table resides in the system tablespace. (DISCARD TABLESPACE does not apply to the system tablespace, and we do no want to drop the adaptive hash index for other tables than the one that is being dropped.) row_truncate_table_for_mysql(): Drop any adaptive hash index entries for the table, except if the table resides in the system tablespace. --- storage/innobase/btr/btr0sea.cc | 15 +++------ storage/innobase/buf/buf0buf.cc | 8 +++-- storage/innobase/buf/buf0lru.cc | 54 ++++++++++++++++-------------- storage/innobase/dict/dict0dict.cc | 32 +++--------------- storage/innobase/fil/fil0fil.cc | 4 +-- storage/innobase/fsp/fsp0fsp.cc | 6 ++-- storage/innobase/include/btr0sea.h | 13 +++---- storage/innobase/include/buf0lru.h | 14 ++++---- storage/innobase/row/row0import.cc | 12 +++++++ storage/innobase/row/row0mysql.cc | 17 ++++++++++ storage/xtradb/btr/btr0sea.cc | 15 +++------ storage/xtradb/buf/buf0buf.cc | 8 +++-- storage/xtradb/buf/buf0lru.cc | 54 ++++++++++++++++-------------- storage/xtradb/dict/dict0dict.cc | 34 +++---------------- storage/xtradb/fil/fil0fil.cc | 4 +-- storage/xtradb/fsp/fsp0fsp.cc | 6 ++-- storage/xtradb/include/btr0sea.h | 13 +++---- storage/xtradb/include/buf0lru.h | 14 ++++---- storage/xtradb/row/row0import.cc | 12 +++++++ storage/xtradb/row/row0mysql.cc | 17 ++++++++++ 20 files changed, 178 insertions(+), 174 deletions(-) diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc index e36e6d6194c..bd5cd02aa75 100644 --- a/storage/innobase/btr/btr0sea.cc +++ b/storage/innobase/btr/btr0sea.cc @@ -2,6 +2,7 @@ Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2018, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -1250,17 +1251,11 @@ cleanup: mem_free(folds); } -/********************************************************************//** -Drops a possible page hash index when a page is evicted from the buffer pool -or freed in a file segment. */ +/** Drop possible adaptive hash index entries when a page is evicted +from the buffer pool or freed in a file, or the index is being dropped. */ UNIV_INTERN void -btr_search_drop_page_hash_when_freed( -/*=================================*/ - ulint space, /*!< in: space id */ - ulint zip_size, /*!< in: compressed page size in bytes - or 0 for uncompressed pages */ - ulint page_no) /*!< in: page number */ +btr_search_drop_page_hash_when_freed(ulint space, ulint page_no) { buf_block_t* block; mtr_t mtr; @@ -1273,7 +1268,7 @@ btr_search_drop_page_hash_when_freed( are possibly holding, we cannot s-latch the page, but must (recursively) x-latch it, even though we are only reading. */ - block = buf_page_get_gen(space, zip_size, page_no, RW_X_LATCH, NULL, + block = buf_page_get_gen(space, 0, page_no, RW_X_LATCH, NULL, BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__, &mtr); diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 3b160209cf0..50c052fc690 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -3075,17 +3075,18 @@ buf_page_get_gen( #ifdef UNIV_DEBUG switch (mode) { case BUF_EVICT_IF_IN_POOL: + case BUF_PEEK_IF_IN_POOL: /* After DISCARD TABLESPACE, the tablespace would not exist, but in IMPORT TABLESPACE, PageConverter::operator() must replace any old pages, which were not evicted during DISCARD. - Skip the assertion on zip_size. */ + Similarly, btr_search_drop_page_hash_when_freed() must + remove any old pages. Skip the assertion on zip_size. */ break; case BUF_GET_NO_LATCH: ut_ad(rw_latch == RW_NO_LATCH); /* fall through */ case BUF_GET: case BUF_GET_IF_IN_POOL: - case BUF_PEEK_IF_IN_POOL: case BUF_GET_IF_IN_POOL_OR_WATCH: case BUF_GET_POSSIBLY_FREED: ut_ad(zip_size == fil_space_get_zip_size(space)); @@ -3257,7 +3258,8 @@ got_block: fix_mutex = buf_page_get_mutex(&fix_block->page); - ut_ad(page_zip_get_size(&block->page.zip) == zip_size); + ut_ad(page_zip_get_size(&block->page.zip) == zip_size + || mode == BUF_PEEK_IF_IN_POOL); switch (mode) { case BUF_GET_IF_IN_POOL: diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 9e89a291c80..7039ecdf4a6 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -241,8 +241,6 @@ void buf_LRU_drop_page_hash_batch( /*=========================*/ ulint space_id, /*!< in: space id */ - ulint zip_size, /*!< in: compressed page size in bytes - or 0 for uncompressed pages */ const ulint* arr, /*!< in: array of page_no */ ulint count) /*!< in: number of entries in array */ { @@ -252,8 +250,7 @@ buf_LRU_drop_page_hash_batch( ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE); for (i = 0; i < count; ++i) { - btr_search_drop_page_hash_when_freed(space_id, zip_size, - arr[i]); + btr_search_drop_page_hash_when_freed(space_id, arr[i]); } } @@ -272,15 +269,6 @@ buf_LRU_drop_page_hash_for_tablespace( buf_page_t* bpage; ulint* page_arr; ulint num_entries; - ulint zip_size; - - zip_size = fil_space_get_zip_size(id); - - if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { - /* Somehow, the tablespace does not exist. Nothing to drop. */ - ut_ad(0); - return; - } page_arr = static_cast(ut_malloc( sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE)); @@ -333,8 +321,7 @@ next_page: the latching order. */ buf_pool_mutex_exit(buf_pool); - buf_LRU_drop_page_hash_batch( - id, zip_size, page_arr, num_entries); + buf_LRU_drop_page_hash_batch(id, page_arr, num_entries); num_entries = 0; @@ -365,10 +352,32 @@ next_page: buf_pool_mutex_exit(buf_pool); /* Drop any remaining batch of search hashed pages. */ - buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries); + buf_LRU_drop_page_hash_batch(id, page_arr, num_entries); ut_free(page_arr); } +/** Drop the adaptive hash index for a tablespace. +@param[in,out] table table */ +UNIV_INTERN void buf_LRU_drop_page_hash_for_tablespace(dict_table_t* table) +{ + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + if (btr_search_info_get_ref_count( + btr_search_get_info(index))) { + goto drop_ahi; + } + } + + return; +drop_ahi: + ulint id = table->space; + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + buf_LRU_drop_page_hash_for_tablespace(buf_pool_from_array(i), + id); + } +} + /******************************************************************//** While flushing (or removing dirty) pages from a tablespace we don't want to hog the CPU and resources. Release the buffer pool and block @@ -675,18 +684,11 @@ buf_flush_dirty_pages(buf_pool_t* buf_pool, ulint id, const trx_t* trx) /** Empty the flush list for all pages belonging to a tablespace. @param[in] id tablespace identifier @param[in] trx transaction, for checking for user interrupt; - or NULL if nothing is to be written -@param[in] drop_ahi whether to drop the adaptive hash index */ -UNIV_INTERN -void -buf_LRU_flush_or_remove_pages(ulint id, const trx_t* trx, bool drop_ahi) + or NULL if nothing is to be written */ +UNIV_INTERN void buf_LRU_flush_or_remove_pages(ulint id, const trx_t* trx) { for (ulint i = 0; i < srv_buf_pool_instances; i++) { - buf_pool_t* buf_pool = buf_pool_from_array(i); - if (drop_ahi) { - buf_LRU_drop_page_hash_for_tablespace(buf_pool, id); - } - buf_flush_dirty_pages(buf_pool, id, trx); + buf_flush_dirty_pages(buf_pool_from_array(i), id, trx); } if (trx && !trx_is_interrupted(trx)) { diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index 623657ef9fe..9609ef96343 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -2,7 +2,7 @@ Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1674,7 +1674,7 @@ dict_table_rename_in_cache( filepath = fil_make_ibd_name(table->name, false); } - fil_delete_tablespace(table->space, true); + fil_delete_tablespace(table->space); /* Delete any temp file hanging around. */ if (os_file_status(filepath, &exists, &ftype) @@ -2719,35 +2719,13 @@ dict_index_remove_from_cache_low( zero. See also: dict_table_can_be_evicted() */ do { - ulint ref_count = btr_search_info_get_ref_count(info); - - if (ref_count == 0) { + if (!btr_search_info_get_ref_count(info)) { break; } - /* Sleep for 10ms before trying again. */ - os_thread_sleep(10000); - ++retries; + buf_LRU_drop_page_hash_for_tablespace(table); - if (retries % 500 == 0) { - /* No luck after 5 seconds of wait. */ - fprintf(stderr, "InnoDB: Error: Waited for" - " %lu secs for hash index" - " ref_count (%lu) to drop" - " to 0.\n" - "index: \"%s\"" - " table: \"%s\"\n", - retries/100, - ref_count, - index->name, - table->name); - } - - /* To avoid a hang here we commit suicide if the - ref_count doesn't drop to zero in 600 seconds. */ - if (retries >= 60000) { - ut_error; - } + ut_a(++retries < 10000); } while (srv_shutdown_state == SRV_SHUTDOWN_NONE || !lru_evict); rw_lock_free(&index->lock); diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 738afe4ab86..2ba1d90d347 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -2891,7 +2891,7 @@ fil_delete_tablespace(ulint id, bool drop_ahi) To deal with potential read requests by checking the ::stop_new_ops flag in fil_io() */ - buf_LRU_flush_or_remove_pages(id, NULL, drop_ahi); + buf_LRU_flush_or_remove_pages(id, NULL); #endif /* !UNIV_HOTBACKUP */ @@ -3002,7 +3002,7 @@ fil_discard_tablespace( { dberr_t err; - switch (err = fil_delete_tablespace(id, true)) { + switch (err = fil_delete_tablespace(id)) { case DB_SUCCESS: break; diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index 1cf37f366d7..b20c59c4d8c 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -3027,7 +3027,7 @@ fseg_free_page_low( /* Drop search system page hash index if the page is found in the pool and is hashed */ - btr_search_drop_page_hash_when_freed(space, zip_size, page); + btr_search_drop_page_hash_when_freed(space, page); descr = xdes_get_descriptor(space, zip_size, page, mtr); @@ -3247,7 +3247,7 @@ fseg_free_extent( found in the pool and is hashed */ btr_search_drop_page_hash_when_freed( - space, zip_size, first_page_in_extent + i); + space, first_page_in_extent + i); } } diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h index c95ca28057a..4e1df7066d1 100644 --- a/storage/innobase/include/btr0sea.h +++ b/storage/innobase/include/btr0sea.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -141,17 +142,11 @@ btr_search_drop_page_hash_index( s- or x-latched, or an index page for which we know that block->buf_fix_count == 0 */ -/********************************************************************//** -Drops a possible page hash index when a page is evicted from the buffer pool -or freed in a file segment. */ +/** Drop possible adaptive hash index entries when a page is evicted +from the buffer pool or freed in a file, or the index is being dropped. */ UNIV_INTERN void -btr_search_drop_page_hash_when_freed( -/*=================================*/ - ulint space, /*!< in: space id */ - ulint zip_size, /*!< in: compressed page size in bytes - or 0 for uncompressed pages */ - ulint page_no); /*!< in: page number */ +btr_search_drop_page_hash_when_freed(ulint space, ulint page_no); /********************************************************************//** Updates the page hash index when a single record is inserted on a page. */ UNIV_INTERN diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index 308bda20c7b..623883433c2 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -34,6 +34,7 @@ Created 11/5/1995 Heikki Tuuri // Forward declaration struct trx_t; +struct dict_table_t; /******************************************************************//** Returns TRUE if less than 25 % of the buffer pool is available. This can be @@ -52,14 +53,15 @@ These are low-level functions /** Minimum LRU list length for which the LRU_old pointer is defined */ #define BUF_LRU_OLD_MIN_LEN 512 /* 8 megabytes of 16k pages */ +/** Drop the adaptive hash index for a tablespace. +@param[in,out] table table */ +UNIV_INTERN void buf_LRU_drop_page_hash_for_tablespace(dict_table_t* table); + /** Empty the flush list for all pages belonging to a tablespace. @param[in] id tablespace identifier @param[in] trx transaction, for checking for user interrupt; - or NULL if nothing is to be written -@param[in] drop_ahi whether to drop the adaptive hash index */ -UNIV_INTERN -void -buf_LRU_flush_or_remove_pages(ulint id, const trx_t* trx, bool drop_ahi=false); + or NULL if nothing is to be written */ +UNIV_INTERN void buf_LRU_flush_or_remove_pages(ulint id, const trx_t* trx); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /********************************************************************//** diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc index a9c24a0f8cc..dfd6b4bfbea 100644 --- a/storage/innobase/row/row0import.cc +++ b/storage/innobase/row/row0import.cc @@ -31,6 +31,7 @@ Created 2012-02-08 by Sunny Bains. #endif #include "btr0pcur.h" +#include "btr0sea.h" #include "que0que.h" #include "dict0boot.h" #include "ibuf0ibuf.h" @@ -3983,6 +3984,17 @@ row_import_for_mysql( return(row_import_cleanup(prebuilt, trx, err)); } + /* On DISCARD TABLESPACE, we did not drop any adaptive hash + index entries. If we replaced the discarded tablespace with a + smaller one here, there could still be some adaptive hash + index entries that point to cached garbage pages in the buffer + pool, because PageConverter::operator() only evicted those + pages that were replaced by the imported pages. We must + discard all remaining adaptive hash index entries, because the + adaptive hash index must be a subset of the table contents; + false positives are not tolerated. */ + buf_LRU_drop_page_hash_for_tablespace(table); + row_mysql_lock_data_dictionary(trx); /* If the table is stored in a remote tablespace, we need to diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index e34e4fa94ff..be24ae885a2 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -3516,6 +3516,8 @@ row_truncate_table_for_mysql( fil_space_release(space); } + buf_LRU_drop_page_hash_for_tablespace(table); + if (flags != ULINT_UNDEFINED && fil_discard_tablespace(space_id) == DB_SUCCESS) { @@ -4209,6 +4211,21 @@ row_drop_table_for_mysql( rw_lock_x_unlock(dict_index_get_lock(index)); } + if (table->space != TRX_SYS_SPACE) { + /* On DISCARD TABLESPACE, we would not drop the + adaptive hash index entries. If the tablespace is + missing here, delete-marking the record in SYS_INDEXES + would not free any pages in the buffer pool. Thus, + dict_index_remove_from_cache() would hang due to + adaptive hash index entries existing in the buffer + pool. To prevent this hang, and also to guarantee + that btr_search_drop_page_hash_when_freed() will avoid + calling btr_search_drop_page_hash_index() while we + hold the InnoDB dictionary lock, we will drop any + adaptive hash index entries upfront. */ + buf_LRU_drop_page_hash_for_tablespace(table); + } + /* We use the private SQL parser of Innobase to generate the query graphs needed in deleting the dictionary data from system tables in Innobase. Deleting a row from SYS_INDEXES table also diff --git a/storage/xtradb/btr/btr0sea.cc b/storage/xtradb/btr/btr0sea.cc index 12c99246f16..713a584ee7e 100644 --- a/storage/xtradb/btr/btr0sea.cc +++ b/storage/xtradb/btr/btr0sea.cc @@ -2,6 +2,7 @@ Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2018, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -1299,17 +1300,11 @@ cleanup: mem_free(folds); } -/********************************************************************//** -Drops a possible page hash index when a page is evicted from the buffer pool -or freed in a file segment. */ +/** Drop possible adaptive hash index entries when a page is evicted +from the buffer pool or freed in a file, or the index is being dropped. */ UNIV_INTERN void -btr_search_drop_page_hash_when_freed( -/*=================================*/ - ulint space, /*!< in: space id */ - ulint zip_size, /*!< in: compressed page size in bytes - or 0 for uncompressed pages */ - ulint page_no) /*!< in: page number */ +btr_search_drop_page_hash_when_freed(ulint space, ulint page_no) { buf_block_t* block; mtr_t mtr; @@ -1322,7 +1317,7 @@ btr_search_drop_page_hash_when_freed( are possibly holding, we cannot s-latch the page, but must (recursively) x-latch it, even though we are only reading. */ - block = buf_page_get_gen(space, zip_size, page_no, RW_X_LATCH, NULL, + block = buf_page_get_gen(space, 0, page_no, RW_X_LATCH, NULL, BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__, &mtr); diff --git a/storage/xtradb/buf/buf0buf.cc b/storage/xtradb/buf/buf0buf.cc index 12c5c6efb29..3814d08f19a 100644 --- a/storage/xtradb/buf/buf0buf.cc +++ b/storage/xtradb/buf/buf0buf.cc @@ -2969,17 +2969,18 @@ buf_page_get_gen( #ifdef UNIV_DEBUG switch (mode) { case BUF_EVICT_IF_IN_POOL: + case BUF_PEEK_IF_IN_POOL: /* After DISCARD TABLESPACE, the tablespace would not exist, but in IMPORT TABLESPACE, PageConverter::operator() must replace any old pages, which were not evicted during DISCARD. - Skip the assertion on zip_size. */ + Similarly, btr_search_drop_page_hash_when_freed() must + remove any old pages. Skip the assertion on zip_size. */ break; case BUF_GET_NO_LATCH: ut_ad(rw_latch == RW_NO_LATCH); /* fall through */ case BUF_GET: case BUF_GET_IF_IN_POOL: - case BUF_PEEK_IF_IN_POOL: case BUF_GET_IF_IN_POOL_OR_WATCH: case BUF_GET_POSSIBLY_FREED: ut_ad(zip_size == fil_space_get_zip_size(space)); @@ -3156,7 +3157,8 @@ got_block: fix_mutex = buf_page_get_mutex(&fix_block->page); - ut_ad(page_zip_get_size(&block->page.zip) == zip_size); + ut_ad(page_zip_get_size(&block->page.zip) == zip_size + || mode == BUF_PEEK_IF_IN_POOL); switch (mode) { case BUF_GET_IF_IN_POOL: diff --git a/storage/xtradb/buf/buf0lru.cc b/storage/xtradb/buf/buf0lru.cc index 7bf423ed740..2c4a4049de6 100644 --- a/storage/xtradb/buf/buf0lru.cc +++ b/storage/xtradb/buf/buf0lru.cc @@ -238,8 +238,6 @@ void buf_LRU_drop_page_hash_batch( /*=========================*/ ulint space_id, /*!< in: space id */ - ulint zip_size, /*!< in: compressed page size in bytes - or 0 for uncompressed pages */ const ulint* arr, /*!< in: array of page_no */ ulint count) /*!< in: number of entries in array */ { @@ -249,8 +247,7 @@ buf_LRU_drop_page_hash_batch( ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE); for (i = 0; i < count; ++i) { - btr_search_drop_page_hash_when_freed(space_id, zip_size, - arr[i]); + btr_search_drop_page_hash_when_freed(space_id, arr[i]); } } @@ -269,15 +266,6 @@ buf_LRU_drop_page_hash_for_tablespace( buf_page_t* bpage; ulint* page_arr; ulint num_entries; - ulint zip_size; - - zip_size = fil_space_get_zip_size(id); - - if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { - /* Somehow, the tablespace does not exist. Nothing to drop. */ - ut_ad(0); - return; - } page_arr = static_cast(ut_malloc( sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE)); @@ -331,8 +319,7 @@ next_page: the latching order. */ mutex_exit(&buf_pool->LRU_list_mutex); - buf_LRU_drop_page_hash_batch( - id, zip_size, page_arr, num_entries); + buf_LRU_drop_page_hash_batch(id, page_arr, num_entries); num_entries = 0; @@ -363,10 +350,32 @@ next_page: mutex_exit(&buf_pool->LRU_list_mutex); /* Drop any remaining batch of search hashed pages. */ - buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries); + buf_LRU_drop_page_hash_batch(id, page_arr, num_entries); ut_free(page_arr); } +/** Drop the adaptive hash index for a tablespace. +@param[in,out] table table */ +UNIV_INTERN void buf_LRU_drop_page_hash_for_tablespace(dict_table_t* table) +{ + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + if (btr_search_info_get_ref_count(btr_search_get_info(index), + index)) { + goto drop_ahi; + } + } + + return; +drop_ahi: + ulint id = table->space; + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + buf_LRU_drop_page_hash_for_tablespace(buf_pool_from_array(i), + id); + } +} + /******************************************************************//** While flushing (or removing dirty) pages from a tablespace we don't want to hog the CPU and resources. Release the buffer pool and block @@ -733,18 +742,11 @@ buf_flush_dirty_pages(buf_pool_t* buf_pool, ulint id, const trx_t* trx) /** Empty the flush list for all pages belonging to a tablespace. @param[in] id tablespace identifier @param[in] trx transaction, for checking for user interrupt; - or NULL if nothing is to be written -@param[in] drop_ahi whether to drop the adaptive hash index */ -UNIV_INTERN -void -buf_LRU_flush_or_remove_pages(ulint id, const trx_t* trx, bool drop_ahi) + or NULL if nothing is to be written */ +UNIV_INTERN void buf_LRU_flush_or_remove_pages(ulint id, const trx_t* trx) { for (ulint i = 0; i < srv_buf_pool_instances; i++) { - buf_pool_t* buf_pool = buf_pool_from_array(i); - if (drop_ahi) { - buf_LRU_drop_page_hash_for_tablespace(buf_pool, id); - } - buf_flush_dirty_pages(buf_pool, id, trx); + buf_flush_dirty_pages(buf_pool_from_array(i), id, trx); } if (trx && !trx_is_interrupted(trx)) { diff --git a/storage/xtradb/dict/dict0dict.cc b/storage/xtradb/dict/dict0dict.cc index e361b73ab77..7ade6d79adf 100644 --- a/storage/xtradb/dict/dict0dict.cc +++ b/storage/xtradb/dict/dict0dict.cc @@ -2,7 +2,7 @@ Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2017, MariaDB Corporation. +Copyright (c) 2013, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1680,7 +1680,7 @@ dict_table_rename_in_cache( filepath = fil_make_ibd_name(table->name, false); } - fil_delete_tablespace(table->space, true); + fil_delete_tablespace(table->space); /* Delete any temp file hanging around. */ if (os_file_status(filepath, &exists, &ftype) @@ -2729,36 +2729,12 @@ dict_index_remove_from_cache_low( zero. See also: dict_table_can_be_evicted() */ do { - ulint ref_count = btr_search_info_get_ref_count(info, - index); - - if (ref_count == 0) { + if (!btr_search_info_get_ref_count(info, index)) { break; } - /* Sleep for 10ms before trying again. */ - os_thread_sleep(10000); - ++retries; - - if (retries % 500 == 0) { - /* No luck after 5 seconds of wait. */ - fprintf(stderr, "InnoDB: Error: Waited for" - " %lu secs for hash index" - " ref_count (%lu) to drop" - " to 0.\n" - "index: \"%s\"" - " table: \"%s\"\n", - retries/100, - ref_count, - index->name, - table->name); - } - - /* To avoid a hang here we commit suicide if the - ref_count doesn't drop to zero in 600 seconds. */ - if (retries >= 60000) { - ut_error; - } + buf_LRU_drop_page_hash_for_tablespace(table); + ut_a(++retries < 10000); } while (srv_shutdown_state == SRV_SHUTDOWN_NONE || !lru_evict); rw_lock_free(&index->lock); diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc index a24b319fda6..9bd26fcf35b 100644 --- a/storage/xtradb/fil/fil0fil.cc +++ b/storage/xtradb/fil/fil0fil.cc @@ -2936,7 +2936,7 @@ fil_delete_tablespace(ulint id, bool drop_ahi) To deal with potential read requests by checking the ::stop_new_ops flag in fil_io() */ - buf_LRU_flush_or_remove_pages(id, NULL, drop_ahi); + buf_LRU_flush_or_remove_pages(id, NULL); #endif /* !UNIV_HOTBACKUP */ @@ -3047,7 +3047,7 @@ fil_discard_tablespace( { dberr_t err; - switch (err = fil_delete_tablespace(id, true)) { + switch (err = fil_delete_tablespace(id)) { case DB_SUCCESS: break; diff --git a/storage/xtradb/fsp/fsp0fsp.cc b/storage/xtradb/fsp/fsp0fsp.cc index ffed8a6edd3..f97e0c1331b 100644 --- a/storage/xtradb/fsp/fsp0fsp.cc +++ b/storage/xtradb/fsp/fsp0fsp.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -3035,7 +3035,7 @@ fseg_free_page_low( /* Drop search system page hash index if the page is found in the pool and is hashed */ - btr_search_drop_page_hash_when_freed(space, zip_size, page); + btr_search_drop_page_hash_when_freed(space, page); descr = xdes_get_descriptor(space, zip_size, page, mtr); @@ -3261,7 +3261,7 @@ fseg_free_extent( found in the pool and is hashed */ btr_search_drop_page_hash_when_freed( - space, zip_size, first_page_in_extent + i); + space, first_page_in_extent + i); } } diff --git a/storage/xtradb/include/btr0sea.h b/storage/xtradb/include/btr0sea.h index bfe2c43defb..55366d3c0d3 100644 --- a/storage/xtradb/include/btr0sea.h +++ b/storage/xtradb/include/btr0sea.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -142,17 +143,11 @@ btr_search_drop_page_hash_index( s- or x-latched, or an index page for which we know that block->buf_fix_count == 0 */ -/********************************************************************//** -Drops a possible page hash index when a page is evicted from the buffer pool -or freed in a file segment. */ +/** Drop possible adaptive hash index entries when a page is evicted +from the buffer pool or freed in a file, or the index is being dropped. */ UNIV_INTERN void -btr_search_drop_page_hash_when_freed( -/*=================================*/ - ulint space, /*!< in: space id */ - ulint zip_size, /*!< in: compressed page size in bytes - or 0 for uncompressed pages */ - ulint page_no); /*!< in: page number */ +btr_search_drop_page_hash_when_freed(ulint space, ulint page_no); /********************************************************************//** Updates the page hash index when a single record is inserted on a page. */ UNIV_INTERN diff --git a/storage/xtradb/include/buf0lru.h b/storage/xtradb/include/buf0lru.h index 1bc11937fa1..f0ba1bb227d 100644 --- a/storage/xtradb/include/buf0lru.h +++ b/storage/xtradb/include/buf0lru.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2018, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -36,6 +36,7 @@ Created 11/5/1995 Heikki Tuuri // Forward declaration struct trx_t; +struct dict_table_t; /******************************************************************//** Returns TRUE if less than 25 % of the buffer pool is available. This can be @@ -54,14 +55,15 @@ These are low-level functions /** Minimum LRU list length for which the LRU_old pointer is defined */ #define BUF_LRU_OLD_MIN_LEN 512 /* 8 megabytes of 16k pages */ +/** Drop the adaptive hash index for a tablespace. +@param[in,out] table table */ +UNIV_INTERN void buf_LRU_drop_page_hash_for_tablespace(dict_table_t* table); + /** Empty the flush list for all pages belonging to a tablespace. @param[in] id tablespace identifier @param[in] trx transaction, for checking for user interrupt; - or NULL if nothing is to be written -@param[in] drop_ahi whether to drop the adaptive hash index */ -UNIV_INTERN -void -buf_LRU_flush_or_remove_pages(ulint id, const trx_t* trx, bool drop_ahi=false); + or NULL if nothing is to be written */ +UNIV_INTERN void buf_LRU_flush_or_remove_pages(ulint id, const trx_t* trx); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /********************************************************************//** diff --git a/storage/xtradb/row/row0import.cc b/storage/xtradb/row/row0import.cc index 36ffba68291..a3478dc2fc4 100644 --- a/storage/xtradb/row/row0import.cc +++ b/storage/xtradb/row/row0import.cc @@ -31,6 +31,7 @@ Created 2012-02-08 by Sunny Bains. #endif #include "btr0pcur.h" +#include "btr0sea.h" #include "que0que.h" #include "dict0boot.h" #include "ibuf0ibuf.h" @@ -3982,6 +3983,17 @@ row_import_for_mysql( return(row_import_cleanup(prebuilt, trx, err)); } + /* On DISCARD TABLESPACE, we did not drop any adaptive hash + index entries. If we replaced the discarded tablespace with a + smaller one here, there could still be some adaptive hash + index entries that point to cached garbage pages in the buffer + pool, because PageConverter::operator() only evicted those + pages that were replaced by the imported pages. We must + discard all remaining adaptive hash index entries, because the + adaptive hash index must be a subset of the table contents; + false positives are not tolerated. */ + buf_LRU_drop_page_hash_for_tablespace(table); + row_mysql_lock_data_dictionary(trx); /* If the table is stored in a remote tablespace, we need to diff --git a/storage/xtradb/row/row0mysql.cc b/storage/xtradb/row/row0mysql.cc index 1acdfe53e0c..2b6f38ba2af 100644 --- a/storage/xtradb/row/row0mysql.cc +++ b/storage/xtradb/row/row0mysql.cc @@ -3540,6 +3540,8 @@ row_truncate_table_for_mysql( fil_space_release(space); } + buf_LRU_drop_page_hash_for_tablespace(table); + if (flags != ULINT_UNDEFINED && fil_discard_tablespace(space_id) == DB_SUCCESS) { @@ -4239,6 +4241,21 @@ row_drop_table_for_mysql( rw_lock_x_unlock(dict_index_get_lock(index)); } + if (table->space != TRX_SYS_SPACE) { + /* On DISCARD TABLESPACE, we would not drop the + adaptive hash index entries. If the tablespace is + missing here, delete-marking the record in SYS_INDEXES + would not free any pages in the buffer pool. Thus, + dict_index_remove_from_cache() would hang due to + adaptive hash index entries existing in the buffer + pool. To prevent this hang, and also to guarantee + that btr_search_drop_page_hash_when_freed() will avoid + calling btr_search_drop_page_hash_index() while we + hold the InnoDB dictionary lock, we will drop any + adaptive hash index entries upfront. */ + buf_LRU_drop_page_hash_for_tablespace(table); + } + /* We use the private SQL parser of Innobase to generate the query graphs needed in deleting the dictionary data from system tables in Innobase. Deleting a row from SYS_INDEXES table also