From 5a61fd58197deee41b14a02a18550ef628b09442 Mon Sep 17 00:00:00 2001 From: Oleksandr Byelkin Date: Thu, 29 Aug 2024 12:30:05 +0200 Subject: [PATCH 01/21] MDEV-34831: MDEV-34704 introduces a typo, --qick Fix MDEV-34704 typos. --- client/mysql.cc | 3 ++- mysql-test/main/client.result | 2 +- mysql-test/main/client.test | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/client/mysql.cc b/client/mysql.cc index 37f8482cfc5..aadfa72fcf7 100644 --- a/client/mysql.cc +++ b/client/mysql.cc @@ -1681,7 +1681,8 @@ static struct my_option my_long_options[] = "if the output is suspended. Doesn't use history file.", &quick, &quick, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, {"quick-max-column-width", 0, - "Maximal field length limit in case of --qick", &quick_max_column_width, + "Maximum number of characters displayed in a column header" + " when using --quick", &quick_max_column_width, &quick_max_column_width, 0, GET_ULONG, REQUIRED_ARG, LONG_MAX, 0, ULONG_MAX, 0, 1, 0}, {"raw", 'r', "Write fields without conversion. Used with --batch.", diff --git a/mysql-test/main/client.result b/mysql-test/main/client.result index bc3b9f64e81..86620d58bc6 100644 --- a/mysql-test/main/client.result +++ b/mysql-test/main/client.result @@ -58,5 +58,5 @@ insert into t1 values ("01234", "0123456789", "01234567890123456789", "1"); +-----------+------------+----------------------+------+ drop table t1; # -# End of 10.7 tests +# End of 10.5 tests # diff --git a/mysql-test/main/client.test b/mysql-test/main/client.test index 20df85f0807..d7249c15e75 100644 --- a/mysql-test/main/client.test +++ b/mysql-test/main/client.test @@ -42,5 +42,5 @@ insert into t1 values ("01234", "0123456789", "01234567890123456789", "1"); drop table t1; --echo # ---echo # End of 10.7 tests +--echo # End of 10.5 tests --echo # From b1d74b7e7214178ab0e81b5fb4c27165ecd9f83f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Mon, 29 Apr 2024 10:13:03 +0300 Subject: [PATCH 02/21] MDEV-33997 : Assertion `((WSREP_PROVIDER_EXISTS_ && this->variables.wsrep_on) && wsrep_emulate_bin_log) || mysql_bin_log.is_open()' failed in int THD::binlog_write_row(TABLE*, bool, const uchar*) Problem was that we did not found that table was partitioned and then we should find what is actual underlaying storage engine. We should not use RSU for !InnoDB tables. Signed-off-by: Julius Goryavsky --- mysql-test/suite/wsrep/r/MDEV-33997.result | 38 ++++++++++++++ mysql-test/suite/wsrep/t/MDEV-33997.cnf | 9 ++++ .../suite/wsrep/t/MDEV-33997.combinations | 4 ++ mysql-test/suite/wsrep/t/MDEV-33997.test | 49 +++++++++++++++++++ sql/log_event_server.cc | 14 +++--- sql/sql_parse.cc | 7 +++ 6 files changed, 115 insertions(+), 6 deletions(-) create mode 100644 mysql-test/suite/wsrep/r/MDEV-33997.result create mode 100644 mysql-test/suite/wsrep/t/MDEV-33997.cnf create mode 100644 mysql-test/suite/wsrep/t/MDEV-33997.combinations create mode 100644 mysql-test/suite/wsrep/t/MDEV-33997.test diff --git a/mysql-test/suite/wsrep/r/MDEV-33997.result b/mysql-test/suite/wsrep/r/MDEV-33997.result new file mode 100644 index 00000000000..f0e43462054 --- /dev/null +++ b/mysql-test/suite/wsrep/r/MDEV-33997.result @@ -0,0 +1,38 @@ +SET SESSION wsrep_osu_method=RSU; +SET autocommit=0; +CREATE TABLE t (c INT) ENGINE=INNODB PARTITION BY KEY(c) PARTITIONS 2; +INSERT INTO t VALUES (1); +INSERT INTO t SELECT 1 ; +COMMIT; +SELECT * FROM t; +c +1 +1 +DROP TABLE t; +SET autocommit=1; +SET SESSION wsrep_osu_method=RSU; +CREATE TABLE t (c INT) ENGINE=INNODB PARTITION BY KEY(c) PARTITIONS 2; +INSERT INTO t SELECT 1 ; +SELECT * FROM t; +c +1 +DROP TABLE t; +SET autocommit=1; +SET SESSION wsrep_osu_method=RSU; +CREATE TABLE t (c INT) ENGINE=MYISAM PARTITION BY KEY(c) PARTITIONS 2; +INSERT INTO t SELECT 1 ; +ERROR 42000: This version of MariaDB doesn't yet support 'RSU on this table engine' +SELECT * FROM t; +c +DROP TABLE t; +SET SESSION wsrep_osu_method=RSU; +SET autocommit=0; +CREATE TABLE t (c INT) ENGINE=MYISAM PARTITION BY KEY(c) PARTITIONS 2; +INSERT INTO t VALUES (1); +INSERT INTO t SELECT 1 ; +ERROR 42000: This version of MariaDB doesn't yet support 'RSU on this table engine' +COMMIT; +SELECT * FROM t; +c +1 +DROP TABLE t; diff --git a/mysql-test/suite/wsrep/t/MDEV-33997.cnf b/mysql-test/suite/wsrep/t/MDEV-33997.cnf new file mode 100644 index 00000000000..489c4385dbd --- /dev/null +++ b/mysql-test/suite/wsrep/t/MDEV-33997.cnf @@ -0,0 +1,9 @@ +!include ../my.cnf + +[mysqld.1] +wsrep-on=ON +binlog-format=ROW +innodb-flush-log-at-trx-commit=1 +wsrep-cluster-address=gcomm:// +wsrep-provider=@ENV.WSREP_PROVIDER +innodb-autoinc-lock-mode=2 diff --git a/mysql-test/suite/wsrep/t/MDEV-33997.combinations b/mysql-test/suite/wsrep/t/MDEV-33997.combinations new file mode 100644 index 00000000000..1ce3b45aa1a --- /dev/null +++ b/mysql-test/suite/wsrep/t/MDEV-33997.combinations @@ -0,0 +1,4 @@ +[binlogon] +log-bin + +[binlogoff] diff --git a/mysql-test/suite/wsrep/t/MDEV-33997.test b/mysql-test/suite/wsrep/t/MDEV-33997.test new file mode 100644 index 00000000000..3d015244d7d --- /dev/null +++ b/mysql-test/suite/wsrep/t/MDEV-33997.test @@ -0,0 +1,49 @@ +--source include/have_wsrep.inc +--source include/have_innodb.inc +--source include/have_wsrep_provider.inc +--source include/have_partition.inc +# +# MDEV-33997: Assertion `((WSREP_PROVIDER_EXISTS_ && this->variables.wsrep_on) && wsrep_emulate_bin_log) || mysql_bin_log.is_open()' failed in int THD::binlog_write_row(TABLE*, bool, const uchar*) +# +SET SESSION wsrep_osu_method=RSU; +SET autocommit=0; + +CREATE TABLE t (c INT) ENGINE=INNODB PARTITION BY KEY(c) PARTITIONS 2; +INSERT INTO t VALUES (1); +INSERT INTO t SELECT 1 ; +COMMIT; +SELECT * FROM t; +DROP TABLE t; + +# +# MDEV-27296 : Assertion `((thd && (WSREP_PROVIDER_EXISTS_ && thd->variables.wsrep_on)) && wsrep_emulate_bin_log) || mysql_bin_log.is_open()' failed +# Second test case +# +SET autocommit=1; +SET SESSION wsrep_osu_method=RSU; +CREATE TABLE t (c INT) ENGINE=INNODB PARTITION BY KEY(c) PARTITIONS 2; +INSERT INTO t SELECT 1 ; +SELECT * FROM t; +DROP TABLE t; + +# +# We should not allow RSU for MyISAM +# +SET autocommit=1; +SET SESSION wsrep_osu_method=RSU; +CREATE TABLE t (c INT) ENGINE=MYISAM PARTITION BY KEY(c) PARTITIONS 2; +--error ER_NOT_SUPPORTED_YET +INSERT INTO t SELECT 1 ; +SELECT * FROM t; +DROP TABLE t; + +SET SESSION wsrep_osu_method=RSU; +SET autocommit=0; + +CREATE TABLE t (c INT) ENGINE=MYISAM PARTITION BY KEY(c) PARTITIONS 2; +INSERT INTO t VALUES (1); +--error ER_NOT_SUPPORTED_YET +INSERT INTO t SELECT 1 ; +COMMIT; +SELECT * FROM t; +DROP TABLE t; diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc index d8056fd4378..69a27ed6350 100644 --- a/sql/log_event_server.cc +++ b/sql/log_event_server.cc @@ -5519,13 +5519,15 @@ int Rows_log_event::do_apply_event(rpl_group_info *rgi) #ifdef WITH_WSREP if (WSREP(thd)) { - WSREP_WARN("BF applier failed to open_and_lock_tables: %u, fatal: %d " + WSREP_WARN("BF applier thread=%lu failed to open_and_lock_tables for " + "%s, fatal: %d " "wsrep = (exec_mode: %d conflict_state: %d seqno: %lld)", - thd->get_stmt_da()->sql_errno(), - thd->is_fatal_error, - thd->wsrep_cs().mode(), - thd->wsrep_trx().state(), - (long long) wsrep_thd_trx_seqno(thd)); + thd_get_thread_id(thd), + thd->get_stmt_da()->message(), + thd->is_fatal_error, + thd->wsrep_cs().mode(), + thd->wsrep_trx().state(), + wsrep_thd_trx_seqno(thd)); } #endif /* WITH_WSREP */ if (thd->is_error() && diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index 2d6c7621f7b..01fa466780e 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -4759,6 +4759,13 @@ mysql_execute_command(THD *thd) thd->wsrep_consistency_check= NO_CONSISTENCY_CHECK; } + /* Only TOI allowed to !InnoDB tables */ + if (!is_innodb && wsrep_OSU_method_get(thd) != WSREP_OSU_TOI) + { + my_error(ER_NOT_SUPPORTED_YET, MYF(0), "RSU on this table engine"); + break; + } + // For !InnoDB we start TOI if it is not yet started and hope for the best if (!is_innodb && !wsrep_toi) { From 9091afdc55ab2e6276256da59ce79e318abfb1cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 15 Jun 2023 14:51:56 +0300 Subject: [PATCH 03/21] MDEV-31173 : Server crashes when setting wsrep_cluster_address after adding invalid value to wsrep_allowlist table Problem was that wsrep_schema tables were not marked as category information. Fix allows access to wsrep_schema tables even when node is detached. This is 10.4-10.9 version of fix. Signed-off-by: Julius Goryavsky --- .../r/galera_wsrep_schema_detached.result | 27 +++++++++++++ .../t/galera_wsrep_schema_detached.test | 39 +++++++++++++++++++ sql/table.cc | 23 +++++------ sql/wsrep_schema.cc | 12 +++++- sql/wsrep_schema.h | 12 +++--- 5 files changed, 95 insertions(+), 18 deletions(-) create mode 100644 mysql-test/suite/galera/r/galera_wsrep_schema_detached.result create mode 100644 mysql-test/suite/galera/t/galera_wsrep_schema_detached.test diff --git a/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result b/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result new file mode 100644 index 00000000000..493bf9d3cac --- /dev/null +++ b/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result @@ -0,0 +1,27 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_1; +call mtr.add_suppression("WSREP: async IST sender failed to serve.*"); +SET @wsrep_provider_options_orig = @@GLOBAL.wsrep_provider_options; +connection node_2; +SET @wsrep_cluster_address_orig = @@GLOBAL.wsrep_cluster_address; +SET GLOBAL WSREP_ON=0; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +EXPECT_0 +0 +SELECT COUNT(*) AS EXPECT_1 FROM mysql.wsrep_cluster; +EXPECT_1 +1 +SELECT COUNT(*) AS EXPECT_2 FROM mysql.wsrep_cluster_members; +EXPECT_2 +2 +connection node_1; +SET GLOBAL wsrep_provider_options ='pc.ignore_sb=true'; +connection node_2; +Killing server ... +connection node_1; +connection node_2; +connection node_1; +SET GLOBAL wsrep_provider_options ='pc.ignore_sb=false'; diff --git a/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test b/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test new file mode 100644 index 00000000000..f67091a5fb9 --- /dev/null +++ b/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test @@ -0,0 +1,39 @@ +--source include/galera_cluster.inc + +# Save original auto_increment_offset values. +--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + +--connection node_1 +call mtr.add_suppression("WSREP: async IST sender failed to serve.*"); +SET @wsrep_provider_options_orig = @@GLOBAL.wsrep_provider_options; + +--connection node_2 +SET @wsrep_cluster_address_orig = @@GLOBAL.wsrep_cluster_address; +SET GLOBAL WSREP_ON=0; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_1 FROM mysql.wsrep_cluster; +SELECT COUNT(*) AS EXPECT_2 FROM mysql.wsrep_cluster_members; + +--connection node_1 +SET GLOBAL wsrep_provider_options ='pc.ignore_sb=true'; + +--connection node_2 +--source include/kill_galera.inc + +--connection node_1 +--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' +--source include/wait_condition.inc + +--connection node_2 +--source include/start_mysqld.inc + +--connection node_1 +--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' +--source include/wait_condition.inc + +SET GLOBAL wsrep_provider_options ='pc.ignore_sb=false'; + +# Cleanup +--source include/auto_increment_offset_restore.inc diff --git a/sql/table.cc b/sql/table.cc index 74b478cb816..f0020deb519 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -276,17 +276,6 @@ const char *fn_frm_ext(const char *name) TABLE_CATEGORY get_table_category(const Lex_ident_db &db, const Lex_ident_table &name) { -#ifdef WITH_WSREP - if (db.str && db.streq(MYSQL_SCHEMA_NAME)) - { - if (name.streq(Lex_ident_table{STRING_WITH_LEN(WSREP_STREAMING_TABLE)}) || - name.streq(Lex_ident_table{STRING_WITH_LEN(WSREP_CLUSTER_TABLE)}) || - name.streq(Lex_ident_table{STRING_WITH_LEN(WSREP_MEMBERS_TABLE)})) - { - return TABLE_CATEGORY_INFORMATION; - } - } -#endif /* WITH_WSREP */ if (is_infoschema_db(&db)) return TABLE_CATEGORY_INFORMATION; @@ -308,6 +297,18 @@ TABLE_CATEGORY get_table_category(const Lex_ident_db &db, return TABLE_CATEGORY_LOG; } +#ifdef WITH_WSREP + if (db.streq(WSREP_LEX_SCHEMA)) + { + if(name.streq(WSREP_LEX_STREAMING)) + return TABLE_CATEGORY_INFORMATION; + if (name.streq(WSREP_LEX_CLUSTER)) + return TABLE_CATEGORY_INFORMATION; + if (name.streq(WSREP_LEX_MEMBERS)) + return TABLE_CATEGORY_INFORMATION; + } +#endif /* WITH_WSREP */ + return TABLE_CATEGORY_USER; } diff --git a/sql/wsrep_schema.cc b/sql/wsrep_schema.cc index abc387cc15f..4cb839053c3 100644 --- a/sql/wsrep_schema.cc +++ b/sql/wsrep_schema.cc @@ -1,4 +1,4 @@ -/* Copyright (C) 2015-2021 Codership Oy +/* Copyright (C) 2015-2023 Codership Oy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,6 +35,16 @@ #include #include +#define WSREP_SCHEMA "mysql" +#define WSREP_STREAMING_TABLE "wsrep_streaming_log" +#define WSREP_CLUSTER_TABLE "wsrep_cluster" +#define WSREP_MEMBERS_TABLE "wsrep_cluster_members" + +LEX_CSTRING WSREP_LEX_SCHEMA= {STRING_WITH_LEN(WSREP_SCHEMA)}; +LEX_CSTRING WSREP_LEX_STREAMING= {STRING_WITH_LEN(WSREP_STREAMING_TABLE)}; +LEX_CSTRING WSREP_LEX_CLUSTER= {STRING_WITH_LEN(WSREP_CLUSTER_TABLE)}; +LEX_CSTRING WSREP_LEX_MEMBERS= {STRING_WITH_LEN(WSREP_MEMBERS_TABLE)}; + const char* wsrep_sr_table_name_full= WSREP_SCHEMA "/" WSREP_STREAMING_TABLE; static const std::string wsrep_schema_str= WSREP_SCHEMA; diff --git a/sql/wsrep_schema.h b/sql/wsrep_schema.h index 979b175481c..f0f79046768 100644 --- a/sql/wsrep_schema.h +++ b/sql/wsrep_schema.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2015-2019 Codership Oy +/* Copyright (C) 2015-2023 Codership Oy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -33,11 +33,6 @@ struct TABLE_LIST; struct st_mysql_lex_string; typedef struct st_mysql_lex_string LEX_STRING; -#define WSREP_SCHEMA "mysql" -#define WSREP_STREAMING_TABLE "wsrep_streaming_log" -#define WSREP_CLUSTER_TABLE "wsrep_cluster" -#define WSREP_MEMBERS_TABLE "wsrep_cluster_members" - /** Name of the table in `wsrep_schema_str` used for storing streaming replication data. In an InnoDB full format, e.g. "database/tablename". */ extern const char* wsrep_sr_table_name_full; @@ -146,4 +141,9 @@ class Wsrep_schema extern Wsrep_schema* wsrep_schema; +extern LEX_CSTRING WSREP_LEX_SCHEMA; +extern LEX_CSTRING WSREP_LEX_STREAMING; +extern LEX_CSTRING WSREP_LEX_CLUSTER; +extern LEX_CSTRING WSREP_LEX_MEMBERS; + #endif /* !WSREP_SCHEMA_H */ From 74d716876537e244ae20a71874c9e670e0cfc930 Mon Sep 17 00:00:00 2001 From: Igor Babaev Date: Fri, 30 Aug 2024 10:28:28 -0700 Subject: [PATCH 04/21] MDEV-25084 Assertion failure when moving equality from having to where This bug was fixed by the patch for bug MDEV-26402. Only a test case that failed before this patch was applied is added in this commit. --- mysql-test/main/having_cond_pushdown.result | 10 ++++++++++ mysql-test/main/having_cond_pushdown.test | 12 ++++++++++++ 2 files changed, 22 insertions(+) diff --git a/mysql-test/main/having_cond_pushdown.result b/mysql-test/main/having_cond_pushdown.result index 3531c855781..c5877da4abd 100644 --- a/mysql-test/main/having_cond_pushdown.result +++ b/mysql-test/main/having_cond_pushdown.result @@ -5187,4 +5187,14 @@ HAVING (SELECT MAX(b) FROM t1) = a AND a > b; a b 2 1 DROP TABLE t1; +# +# MDEV-25084: Moving equality with constant right side +# from HAVING to WHERE +# (fixed by the patch for MDEV-26402) +# +CREATE TABLE t1 (a CHAR(3)) CHARSET=sjis; +INSERT INTO t1 VALUES ('foo'),('bar'); +SELECT LOAD_FILE('') AS f, a FROM t1 GROUP BY f, a HAVING f = a; +f a +DROP TABLE t1; End of 10.5 tests diff --git a/mysql-test/main/having_cond_pushdown.test b/mysql-test/main/having_cond_pushdown.test index fd414e62a8c..51a346b0ad4 100644 --- a/mysql-test/main/having_cond_pushdown.test +++ b/mysql-test/main/having_cond_pushdown.test @@ -1584,4 +1584,16 @@ eval $q; DROP TABLE t1; + +--echo # +--echo # MDEV-25084: Moving equality with constant right side +--echo # from HAVING to WHERE +--echo # (fixed by the patch for MDEV-26402) +--echo # + +CREATE TABLE t1 (a CHAR(3)) CHARSET=sjis; +INSERT INTO t1 VALUES ('foo'),('bar'); +SELECT LOAD_FILE('') AS f, a FROM t1 GROUP BY f, a HAVING f = a; +DROP TABLE t1; + --echo End of 10.5 tests From b65bbb2fae419eef54cb79584e618b3ef5409aa2 Mon Sep 17 00:00:00 2001 From: Julius Goryavsky Date: Fri, 30 Aug 2024 15:49:51 +0200 Subject: [PATCH 05/21] MDEV-34647: small refactoring after main fix --- sql/sql_parse.cc | 52 ++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index 01fa466780e..377d3051794 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -4747,36 +4747,36 @@ mysql_execute_command(THD *thd) #ifdef WITH_WSREP if (wsrep && !first_table->view) { - bool is_innodb= first_table->table->file->partition_ht()->db_type == DB_TYPE_INNODB; - - // For consistency check inserted table needs to be InnoDB - if (!is_innodb && thd->wsrep_consistency_check != NO_CONSISTENCY_CHECK) + const legacy_db_type db_type= first_table->table->file->partition_ht()->db_type; + // For InnoDB we don't need to worry about anything here: + if (db_type != DB_TYPE_INNODB) { - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - HA_ERR_UNSUPPORTED, - "Galera cluster does support consistency check only" - " for InnoDB tables."); - thd->wsrep_consistency_check= NO_CONSISTENCY_CHECK; - } - - /* Only TOI allowed to !InnoDB tables */ - if (!is_innodb && wsrep_OSU_method_get(thd) != WSREP_OSU_TOI) - { - my_error(ER_NOT_SUPPORTED_YET, MYF(0), "RSU on this table engine"); - break; - } - - // For !InnoDB we start TOI if it is not yet started and hope for the best - if (!is_innodb && !wsrep_toi) - { - const legacy_db_type db_type= first_table->table->file->partition_ht()->db_type; - - /* Currently we support TOI for MyISAM only. */ - if (db_type == DB_TYPE_MYISAM && wsrep_replicate_myisam) - WSREP_TO_ISOLATION_BEGIN(first_table->db.str, first_table->table_name.str, NULL); + // For consistency check inserted table needs to be InnoDB + if (thd->wsrep_consistency_check != NO_CONSISTENCY_CHECK) + { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "Galera cluster does support consistency check only" + " for InnoDB tables."); + thd->wsrep_consistency_check= NO_CONSISTENCY_CHECK; + } + /* Only TOI allowed to !InnoDB tables */ + if (wsrep_OSU_method_get(thd) != WSREP_OSU_TOI) + { + my_error(ER_NOT_SUPPORTED_YET, MYF(0), "RSU on this table engine"); + break; + } + // For !InnoDB we start TOI if it is not yet started and hope for the best + if (!wsrep_toi) + { + /* Currently we support TOI for MyISAM only. */ + if (db_type == DB_TYPE_MYISAM && wsrep_replicate_myisam) + WSREP_TO_ISOLATION_BEGIN(first_table->db.str, first_table->table_name.str, NULL); + } } } #endif /* WITH_WSREP */ + /* Only the INSERT table should be merged. Other will be handled by select. From 54a10a429334a9579558a5d284c510d6f8b5bc97 Mon Sep 17 00:00:00 2001 From: Teemu Ollakka Date: Tue, 21 May 2024 12:40:19 +0300 Subject: [PATCH 06/21] MDEV-32363 Shut down Galera networking and logging on fatal signal When handling fatal signal, shut down Galera networking before printing out stack trace and writing core file. This is to achieve fail-silent semantics on crashes which may keep the process running for a long time, but not fully responding e.g. due to core dumping or symbol resolving. Also suppress all Galera/wsrep logging to avoid logging from background threads to garble crash information from signal handler. Notice that for fully fail-silent crash, Galera 26.4.19 is needed. Signed-off-by: Julius Goryavsky --- sql/signal_handler.cc | 8 ++++++++ sql/wsrep_server_service.cc | 9 ++++++++- sql/wsrep_server_service.h | 4 ++++ sql/wsrep_server_state.cc | 23 +++++++++++++++++++++++ sql/wsrep_server_state.h | 2 ++ 5 files changed, 45 insertions(+), 1 deletion(-) diff --git a/sql/signal_handler.cc b/sql/signal_handler.cc index 3d497e39289..002a4c244d1 100644 --- a/sql/signal_handler.cc +++ b/sql/signal_handler.cc @@ -25,6 +25,10 @@ #include "my_stacktrace.h" #include +#ifdef WITH_WSREP +#include "wsrep_server_state.h" +#endif /* WITH_WSREP */ + #ifdef __WIN__ #include #include @@ -221,6 +225,10 @@ extern "C" sig_handler handle_fatal_signal(int sig) "the equation.\n\n"); } +#ifdef WITH_WSREP + Wsrep_server_state::handle_fatal_signal(); +#endif /* WITH_WSREP */ + #ifdef HAVE_STACKTRACE thd= current_thd; diff --git a/sql/wsrep_server_service.cc b/sql/wsrep_server_service.cc index af2c3efd214..6184ba2df59 100644 --- a/sql/wsrep_server_service.cc +++ b/sql/wsrep_server_service.cc @@ -161,9 +161,16 @@ void Wsrep_server_service::bootstrap() wsrep_set_SE_checkpoint(wsrep::gtid::undefined(), wsrep_gtid_server.undefined()); } +static std::atomic suppress_logging{false}; +void wsrep_suppress_error_logging() { suppress_logging= true; } + void Wsrep_server_service::log_message(enum wsrep::log::level level, - const char* message) + const char *message) { + if (suppress_logging.load(std::memory_order_relaxed)) + { + return; + } switch (level) { case wsrep::log::debug: diff --git a/sql/wsrep_server_service.h b/sql/wsrep_server_service.h index 3a7da229cd4..9a1e148b55f 100644 --- a/sql/wsrep_server_service.h +++ b/sql/wsrep_server_service.h @@ -99,4 +99,8 @@ class Wsrep_storage_service; Wsrep_storage_service* wsrep_create_storage_service(THD *orig_thd, const char *ctx); +/** + Suppress all error logging from wsrep/Galera library. + */ +void wsrep_suppress_error_logging(); #endif /* WSREP_SERVER_SERVICE */ diff --git a/sql/wsrep_server_state.cc b/sql/wsrep_server_state.cc index 6bc4eaf4d86..a936d9dd79d 100644 --- a/sql/wsrep_server_state.cc +++ b/sql/wsrep_server_state.cc @@ -18,6 +18,8 @@ #include "wsrep_server_state.h" #include "wsrep_binlog.h" /* init/deinit group commit */ +#include "my_stacktrace.h" /* my_safe_printf_stderr() */ + mysql_mutex_t LOCK_wsrep_server_state; mysql_cond_t COND_wsrep_server_state; @@ -82,3 +84,24 @@ void Wsrep_server_state::destroy() mysql_cond_destroy(&COND_wsrep_server_state); } } + +void Wsrep_server_state::handle_fatal_signal() +{ + if (m_instance) + { + /* Galera background threads are still running and the logging may be + relatively verbose in case of networking error. Silence all wsrep + logging before shutting down networking to avoid garbling signal + handler output. */ + my_safe_printf_stderr("WSREP: Suppressing further logging\n"); + wsrep_suppress_error_logging(); + + /* Shut down all communication with other nodes to fail silently. */ + my_safe_printf_stderr("WSREP: Shutting down network communications\n"); + if (m_instance->provider().set_node_isolation( + wsrep::provider::node_isolation::isolated)) { + my_safe_printf_stderr("WSREP: Galera library does not support node isolation\n"); + } + my_safe_printf_stderr("\n"); + } +} diff --git a/sql/wsrep_server_state.h b/sql/wsrep_server_state.h index 1ef937300f6..43a93fd5aef 100644 --- a/sql/wsrep_server_state.h +++ b/sql/wsrep_server_state.h @@ -56,6 +56,8 @@ public: return (get_provider().capabilities() & capability); } + static void handle_fatal_signal(); + private: Wsrep_server_state(const std::string& name, const std::string& incoming_address, From 69c6cb5dc4907c9133bb6f8077f35ad5eafb7549 Mon Sep 17 00:00:00 2001 From: Alexey Yurchenko Date: Sun, 19 May 2024 00:23:00 +0300 Subject: [PATCH 07/21] Fix recovering state GTID in case log file contains non-text bytes - use grep with -a option. Signed-off-by: Julius Goryavsky --- mysql-test/suite/galera/include/galera_wsrep_recover.inc | 4 ++-- mysql-test/suite/galera/t/galera_pc_recovery.test | 4 ++-- mysql-test/suite/wsrep/t/wsrep-recover-step.inc | 2 +- scripts/galera_recovery.sh | 4 ++-- scripts/mysqld_safe.sh | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mysql-test/suite/galera/include/galera_wsrep_recover.inc b/mysql-test/suite/galera/include/galera_wsrep_recover.inc index aa2f0e2e777..efe803dcc9f 100644 --- a/mysql-test/suite/galera/include/galera_wsrep_recover.inc +++ b/mysql-test/suite/galera/include/galera_wsrep_recover.inc @@ -10,8 +10,8 @@ if (!$wsrep_recover_additional) --perl use strict; - my $wsrep_start_position_str = "grep 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'"; - my $wsrep_start_position = `grep 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'`; + my $wsrep_start_position_str = "grep -a 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'"; + my $wsrep_start_position = `grep -a 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'`; chomp($wsrep_start_position); die if $wsrep_start_position eq ''; diff --git a/mysql-test/suite/galera/t/galera_pc_recovery.test b/mysql-test/suite/galera/t/galera_pc_recovery.test index 16abe6fc9ba..0fd9c8ab3d9 100644 --- a/mysql-test/suite/galera/t/galera_pc_recovery.test +++ b/mysql-test/suite/galera/t/galera_pc_recovery.test @@ -38,10 +38,10 @@ SELECT COUNT(*) = 1 FROM t1; --perl use strict; - my $wsrep_start_position1 = `grep 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.1.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'`; + my $wsrep_start_position1 = `grep -a 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.1.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'`; chomp($wsrep_start_position1); - my $wsrep_start_position2 = `grep 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.2.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'`; + my $wsrep_start_position2 = `grep -a 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.2.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'`; chomp($wsrep_start_position2); die if $wsrep_start_position1 eq '' || $wsrep_start_position2 eq ''; diff --git a/mysql-test/suite/wsrep/t/wsrep-recover-step.inc b/mysql-test/suite/wsrep/t/wsrep-recover-step.inc index 22669438fe0..b131ac07641 100644 --- a/mysql-test/suite/wsrep/t/wsrep-recover-step.inc +++ b/mysql-test/suite/wsrep/t/wsrep-recover-step.inc @@ -18,7 +18,7 @@ --perl use strict; - my $wsrep_start_position = `grep 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'`; + my $wsrep_start_position = `grep -a 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'`; chomp($wsrep_start_position); die if $wsrep_start_position eq ''; open(FILE, ">", "$ENV{MYSQL_TMP_DIR}/galera_wsrep_start_position.inc") or die; diff --git a/scripts/galera_recovery.sh b/scripts/galera_recovery.sh index 9ff5e4e5f50..50aeb3a0b91 100644 --- a/scripts/galera_recovery.sh +++ b/scripts/galera_recovery.sh @@ -81,10 +81,10 @@ wsrep_recover_position() { # Parse server's error log for recovered position. The server prints # "..skipping position recovery.." if started without wsrep. - recovered_pos="$(grep 'WSREP: Recovered position:' $log_file)" + recovered_pos="$(grep -a 'WSREP: Recovered position:' $log_file)" if [ -z "$recovered_pos" ]; then - skipped="$(grep WSREP $log_file | grep 'skipping position recovery')" + skipped="$(grep -a WSREP $log_file | grep 'skipping position recovery')" if [ -z "$skipped" ]; then log "WSREP: Failed to recover position: '`cat $log_file`'" exit 1 diff --git a/scripts/mysqld_safe.sh b/scripts/mysqld_safe.sh index be115066f80..074840e992d 100644 --- a/scripts/mysqld_safe.sh +++ b/scripts/mysqld_safe.sh @@ -260,9 +260,9 @@ wsrep_recover_position() { exit 1 fi - local rp="$(grep 'WSREP: Recovered position:' $wr_logfile)" + local rp="$(grep -a 'WSREP: Recovered position:' $wr_logfile)" if [ -z "$rp" ]; then - local skipped="$(grep WSREP $wr_logfile | grep 'skipping position recovery')" + local skipped="$(grep -a WSREP $wr_logfile | grep 'skipping position recovery')" if [ -z "$skipped" ]; then log_error "WSREP: Failed to recover position: '`cat $wr_logfile`'" ret=1 From 7119149f8394d063f2f512a7fefc1d733f048867 Mon Sep 17 00:00:00 2001 From: Alexey Yurchenko Date: Sun, 19 May 2024 21:26:46 +0300 Subject: [PATCH 08/21] If donor loop receives unknown signal from the SST script it is an error condition (SST failure), so it should set error code before exiting. Signed-off-by: Julius Goryavsky --- sql/wsrep_sst.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/wsrep_sst.cc b/sql/wsrep_sst.cc index 08a3d159e94..13500a83f43 100644 --- a/sql/wsrep_sst.cc +++ b/sql/wsrep_sst.cc @@ -1840,6 +1840,8 @@ wait_signal: else { WSREP_WARN("Received unknown signal: '%s'", out); + /* since it is the end of the loop, we must set error code */ + err=-EINVAL; proc.wait(); } } From 731a5aba0b6eb465e82a3f243c5daaf735f7313f Mon Sep 17 00:00:00 2001 From: Alexey Yurchenko Date: Sun, 19 May 2024 21:08:46 +0300 Subject: [PATCH 09/21] Use only MySQL code for TOI error vote For TOI events specifically we have a situation where in case of the same error different nodes may generate different messages. This may be for two reasons: - different locale setting between the current client session and server default (we can reasonably require server locales to be identical on all nodes, but user can change message locale for the session) - non-deterministic course of STATEMENT execution e.g. for ALTER TABLE On the other hand we may reasonably expect TOI event failures since they are executed after replication, so we must ensure that voting is consistent. For that purpose error codes should be sufficiently unique and deterministic for TOI event failures as DDLs normally deal with a single object, so we can merely use MySQL error codes to vote on. Notice that this problem does not happen with regular transactional writesets, since the originator node will always vote success and replica nodes are assumed to have the same global locale setting. As such different error messages indicate different errors even if the error code is the same (e.g. ER_DUP_KEY can happen on different rows tables). Use only MySQL error code (without the error message) for error voting in case of TOI event failure. Signed-off-by: Julius Goryavsky --- .../suite/galera_3nodes/r/galera_toi_vote.result | 3 ++- .../suite/galera_3nodes/t/galera_toi_vote.test | 3 +++ sql/wsrep_applier.cc | 16 +++++++++++++--- sql/wsrep_applier.h | 16 +++++++++++++++- sql/wsrep_high_priority_service.cc | 11 ++++++----- sql/wsrep_mysqld.cc | 5 ++++- 6 files changed, 43 insertions(+), 11 deletions(-) diff --git a/mysql-test/suite/galera_3nodes/r/galera_toi_vote.result b/mysql-test/suite/galera_3nodes/r/galera_toi_vote.result index d8d3abe40e9..345fa92c13d 100644 --- a/mysql-test/suite/galera_3nodes/r/galera_toi_vote.result +++ b/mysql-test/suite/galera_3nodes/r/galera_toi_vote.result @@ -7,8 +7,9 @@ connection node_3; SET SESSION wsrep_on=OFF; DROP SCHEMA test; connection node_1; +SET SESSION lc_messages='fr_FR'; CREATE SCHEMA test; -ERROR HY000: Can't create database 'test'; database exists +ERROR HY000: Ne peut créer la base 'test'; elle existe déjà connection node_1; SET SESSION wsrep_sync_wait=0; connection node_2; diff --git a/mysql-test/suite/galera_3nodes/t/galera_toi_vote.test b/mysql-test/suite/galera_3nodes/t/galera_toi_vote.test index 6bc87cf8874..bd53c510cd4 100644 --- a/mysql-test/suite/galera_3nodes/t/galera_toi_vote.test +++ b/mysql-test/suite/galera_3nodes/t/galera_toi_vote.test @@ -24,6 +24,9 @@ DROP SCHEMA test; # This should fail on nodes 1 and 2 and succeed on node 3 --connection node_1 +# Make error message on source node different by changing locale +# It should still agree with node 2 +SET SESSION lc_messages='fr_FR'; --error ER_DB_CREATE_EXISTS CREATE SCHEMA test; diff --git a/sql/wsrep_applier.cc b/sql/wsrep_applier.cc index 513e17b3045..0ec8e6e7f47 100644 --- a/sql/wsrep_applier.cc +++ b/sql/wsrep_applier.cc @@ -82,7 +82,9 @@ wsrep_get_apply_format(THD* thd) return thd->wsrep_rgi->rli->relay_log.description_event_for_exec; } -void wsrep_store_error(const THD* const thd, wsrep::mutable_buffer& dst) +void wsrep_store_error(const THD* const thd, + wsrep::mutable_buffer& dst, + bool const include_msg) { Diagnostics_area::Sql_condition_iterator it= thd->get_stmt_da()->sql_conditions(); @@ -100,8 +102,16 @@ void wsrep_store_error(const THD* const thd, wsrep::mutable_buffer& dst) uint const err_code= cond->get_sql_errno(); const char* const err_str= cond->get_message_text(); - slider+= my_snprintf(slider, buf_end - slider, " %s, Error_code: %d;", - err_str, err_code); + if (include_msg) + { + slider+= snprintf(slider, buf_end - slider, " %s, Error_code: %d;", + err_str, err_code); + } + else + { + slider+= snprintf(slider, buf_end - slider, " Error_code: %d;", + err_code); + } } if (slider != dst.data()) diff --git a/sql/wsrep_applier.h b/sql/wsrep_applier.h index fefca306a70..e633b1b9bf2 100644 --- a/sql/wsrep_applier.h +++ b/sql/wsrep_applier.h @@ -35,7 +35,21 @@ int wsrep_apply_events(THD* thd, #define WSREP_ERR_FAILED 6 // Operation failed for some internal reason #define WSREP_ERR_ABORTED 7 // Operation was aborted externally -void wsrep_store_error(const THD* thd, wsrep::mutable_buffer& buf); +/* Loops over THD diagnostic area and concatenates all error messages + * and error codes to a single continuous buffer to create a unique + * but consistent failure signature which provider can use for voting + * between the nodes in the cluster. + * + * @param thd THD context + * @param dst buffer to store the signature + * @param include_msg whether to use MySQL error message in addition to + * MySQL error code. Note that in the case of a TOI + * operation the message may be not consistent between + * the nodes e.g. due to a different client locale setting + * and should be omitted */ +void wsrep_store_error(const THD* thd, + wsrep::mutable_buffer& buf, + bool include_msg); class Format_description_log_event; void wsrep_set_apply_format(THD*, Format_description_log_event*); diff --git a/sql/wsrep_high_priority_service.cc b/sql/wsrep_high_priority_service.cc index 7eb3f07849a..2c0e2e643fa 100644 --- a/sql/wsrep_high_priority_service.cc +++ b/sql/wsrep_high_priority_service.cc @@ -123,14 +123,15 @@ static void wsrep_setup_uk_and_fk_checks(THD* thd) static int apply_events(THD* thd, Relay_log_info* rli, const wsrep::const_buffer& data, - wsrep::mutable_buffer& err) + wsrep::mutable_buffer& err, + bool const include_msg) { int const ret= wsrep_apply_events(thd, rli, data.data(), data.size()); if (ret || wsrep_thd_has_ignored_error(thd)) { if (ret) { - wsrep_store_error(thd, err); + wsrep_store_error(thd, err, include_msg); } wsrep_dump_rbr_buf_with_header(thd, data.data(), data.size()); } @@ -427,7 +428,7 @@ int Wsrep_high_priority_service::apply_toi(const wsrep::ws_meta& ws_meta, #endif thd->set_time(); - int ret= apply_events(thd, m_rli, data, err); + int ret= apply_events(thd, m_rli, data, err, false); wsrep_thd_set_ignored_error(thd, false); trans_commit(thd); @@ -595,7 +596,7 @@ int Wsrep_applier_service::apply_write_set(const wsrep::ws_meta& ws_meta, #endif /* ENABLED_DEBUG_SYNC */ wsrep_setup_uk_and_fk_checks(thd); - int ret= apply_events(thd, m_rli, data, err); + int ret= apply_events(thd, m_rli, data, err, true); thd->close_temporary_tables(); if (!ret && !(ws_meta.flags() & wsrep::provider::flag::commit)) @@ -764,7 +765,7 @@ int Wsrep_replayer_service::apply_write_set(const wsrep::ws_meta& ws_meta, ws_meta, thd->wsrep_sr().fragments()); } - ret= ret || apply_events(thd, m_rli, data, err); + ret= ret || apply_events(thd, m_rli, data, err, true); thd->close_temporary_tables(); if (!ret && !(ws_meta.flags() & wsrep::provider::flag::commit)) { diff --git a/sql/wsrep_mysqld.cc b/sql/wsrep_mysqld.cc index e584c2cc144..6bb879a0367 100644 --- a/sql/wsrep_mysqld.cc +++ b/sql/wsrep_mysqld.cc @@ -2489,7 +2489,10 @@ static void wsrep_TOI_end(THD *thd) { if (thd->is_error() && !wsrep_must_ignore_error(thd)) { - wsrep_store_error(thd, err); + /* use only error code, for the message can be inconsistent + * between the nodes due to differing lc_message settings + * in client session and server applier thread */ + wsrep_store_error(thd, err, false); } int const ret= client_state.leave_toi_local(err); From 83196a7b2398ed3caa52a04e09b656828ad64804 Mon Sep 17 00:00:00 2001 From: Alexey Yurchenko Date: Sun, 19 May 2024 22:09:18 +0300 Subject: [PATCH 10/21] Add a basic MTR test for DDL error voting to ensure that all DDLs generate consistent error messages, Signed-off-by: Julius Goryavsky --- .../suite/galera/r/galera_vote_ddl.result | 70 +++++++++++++++++++ mysql-test/suite/galera/t/galera_vote_ddl.inc | 54 ++++++++++++++ .../suite/galera/t/galera_vote_ddl.test | 34 +++++++++ 3 files changed, 158 insertions(+) create mode 100644 mysql-test/suite/galera/r/galera_vote_ddl.result create mode 100644 mysql-test/suite/galera/t/galera_vote_ddl.inc create mode 100644 mysql-test/suite/galera/t/galera_vote_ddl.test diff --git a/mysql-test/suite/galera/r/galera_vote_ddl.result b/mysql-test/suite/galera/r/galera_vote_ddl.result new file mode 100644 index 00000000000..bc5d99256e5 --- /dev/null +++ b/mysql-test/suite/galera/r/galera_vote_ddl.result @@ -0,0 +1,70 @@ +connection node_2; +connection node_1; +connection node_1; +SET @@global.wsrep_ignore_apply_errors = 7; +connection node_2; +SET @@global.wsrep_ignore_apply_errors = 7; +connection node_1; +CREATE TABLE t1 (pk INT AUTO_INCREMENT PRIMARY KEY); +connection node_1; +DROP TABLE nonexistent; +ERROR 42S02: Unknown table 'test.nonexistent' +INSERT INTO t1 VALUES (DEFAULT); +connection node_2; +connection node_1; +TRUNCATE TABLE nonexistent; +ERROR 42S02: Table 'test.nonexistent' doesn't exist +INSERT INTO t1 VALUES (DEFAULT); +connection node_2; +connection node_1; +CREATE TABLE nonexistent.t1 (s INT); +ERROR 42000: Unknown database 'nonexistent' +INSERT INTO t1 VALUES (DEFAULT); +connection node_2; +connection node_1; +CREATE TABLE t1 (s INT); +ERROR 42S01: Table 't1' already exists +INSERT INTO t1 VALUES (DEFAULT); +connection node_2; +connection node_1; +ALTER TABLE nonexistent ADD COLUMN (c INT); +ERROR 42S02: Table 'test.nonexistent' doesn't exist +INSERT INTO t1 VALUES (DEFAULT); +connection node_2; +DROP TABLE t1; +connection node_1; +SET @@global.wsrep_ignore_apply_errors = 0; +connection node_2; +SET @@global.wsrep_ignore_apply_errors = 0; +connection node_1; +CREATE TABLE t1 (pk INT AUTO_INCREMENT PRIMARY KEY); +connection node_1; +DROP TABLE nonexistent; +ERROR 42S02: Unknown table 'test.nonexistent' +INSERT INTO t1 VALUES (DEFAULT); +connection node_2; +connection node_1; +TRUNCATE TABLE nonexistent; +ERROR 42S02: Table 'test.nonexistent' doesn't exist +INSERT INTO t1 VALUES (DEFAULT); +connection node_2; +connection node_1; +CREATE TABLE nonexistent.t1 (s INT); +ERROR 42000: Unknown database 'nonexistent' +INSERT INTO t1 VALUES (DEFAULT); +connection node_2; +connection node_1; +CREATE TABLE t1 (s INT); +ERROR 42S01: Table 't1' already exists +INSERT INTO t1 VALUES (DEFAULT); +connection node_2; +connection node_1; +ALTER TABLE nonexistent ADD COLUMN (c INT); +ERROR 42S02: Table 'test.nonexistent' doesn't exist +INSERT INTO t1 VALUES (DEFAULT); +connection node_2; +DROP TABLE t1; +connection node_1; +SET @@global.wsrep_ignore_apply_errors = 7; +connection node_2; +SET @@global.wsrep_ignore_apply_errors = 7; diff --git a/mysql-test/suite/galera/t/galera_vote_ddl.inc b/mysql-test/suite/galera/t/galera_vote_ddl.inc new file mode 100644 index 00000000000..80a543fb886 --- /dev/null +++ b/mysql-test/suite/galera/t/galera_vote_ddl.inc @@ -0,0 +1,54 @@ +--connection node_1 +CREATE TABLE t1 (pk INT AUTO_INCREMENT PRIMARY KEY); + +--connection node_1 +--error 1051 +DROP TABLE nonexistent; + +# Verify cluster is intact +INSERT INTO t1 VALUES (DEFAULT); +--connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1; +--source include/wait_condition.inc + +--connection node_1 +--error 1146 +TRUNCATE TABLE nonexistent; + +# Verify cluster is intact +INSERT INTO t1 VALUES (DEFAULT); +--connection node_2 +--let $wait_condition = SELECT COUNT(*) = 2 FROM t1; +--source include/wait_condition.inc + +--connection node_1 +--error 1049 +CREATE TABLE nonexistent.t1 (s INT); + +# Verify cluster is intact +INSERT INTO t1 VALUES (DEFAULT); +--connection node_2 +--let $wait_condition = SELECT COUNT(*) = 3 FROM t1; +--source include/wait_condition.inc + +--connection node_1 +--error 1050 +CREATE TABLE t1 (s INT); + +# Verify cluster is intact +INSERT INTO t1 VALUES (DEFAULT); +--connection node_2 +--let $wait_condition = SELECT COUNT(*) = 4 FROM t1; +--source include/wait_condition.inc + +--connection node_1 +--error 1146 +ALTER TABLE nonexistent ADD COLUMN (c INT); + +# Verify cluster is intact +INSERT INTO t1 VALUES (DEFAULT); +--connection node_2 +--let $wait_condition = SELECT COUNT(*) = 5 FROM t1; +--source include/wait_condition.inc + +DROP TABLE t1; diff --git a/mysql-test/suite/galera/t/galera_vote_ddl.test b/mysql-test/suite/galera/t/galera_vote_ddl.test new file mode 100644 index 00000000000..9db6e612e7e --- /dev/null +++ b/mysql-test/suite/galera/t/galera_vote_ddl.test @@ -0,0 +1,34 @@ +# +# Test voting on identical DDL errors (error messages should match) +# + +--source include/galera_cluster.inc +--source include/have_binlog_format_row.inc + +# +# 1. Ignore all DDL errors (avoids voting) +# +--connection node_1 +--let $wsrep_ignore_apply_errors_saved1 = `SELECT @@global.wsrep_ignore_apply_errors` +SET @@global.wsrep_ignore_apply_errors = 7; +--connection node_2 +--let $wsrep_ignore_apply_errors_saved2 = `SELECT @@global.wsrep_ignore_apply_errors` +SET @@global.wsrep_ignore_apply_errors = 7; + +--source galera_vote_ddl.inc + +# +# 2. Don't ignore any errors (forces voting) +# +--connection node_1 +SET @@global.wsrep_ignore_apply_errors = 0; +--connection node_2 +SET @@global.wsrep_ignore_apply_errors = 0; + +--source galera_vote_ddl.inc + +--connection node_1 +--eval SET @@global.wsrep_ignore_apply_errors = $wsrep_ignore_apply_errors_saved1 + +--connection node_2 +--eval SET @@global.wsrep_ignore_apply_errors = $wsrep_ignore_apply_errors_saved2 From dd64f29d6bea6f36be9634edd5c2d27f2205ca33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 12 Apr 2024 11:24:42 +0300 Subject: [PATCH 11/21] MDEV-33897 : Galera test failure on galera_3nodes.galera_gtid_consistency Based on logs SST was started before donor reached Primaty state. Add wait_conditions to make sure that nodes reach Primary state before starting next node. Signed-off-by: Julius Goryavsky --- .../r/galera_gtid_consistency.result | 33 +++++++------- .../t/galera_gtid_consistency.cnf | 35 ++++++--------- .../t/galera_gtid_consistency.test | 44 ++++++++++++++++--- 3 files changed, 71 insertions(+), 41 deletions(-) diff --git a/mysql-test/suite/galera_3nodes/r/galera_gtid_consistency.result b/mysql-test/suite/galera_3nodes/r/galera_gtid_consistency.result index ffc5ec0627a..91ff0342b8d 100644 --- a/mysql-test/suite/galera_3nodes/r/galera_gtid_consistency.result +++ b/mysql-test/suite/galera_3nodes/r/galera_gtid_consistency.result @@ -1,6 +1,9 @@ connection node_2; connection node_1; connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3; +connection node_1; +connection node_2; +connection node_3; connect node_2b, 127.0.0.1, root, , test, $NODE_MYPORT_2; set wsrep_sync_wait=0; connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1; @@ -44,9 +47,9 @@ connection node_1b; connection node_1; connection node_3; connection node_1; -CALL insert_row('node1', 500); +CALL insert_row('node1', 100); connection node_3; -CALL insert_row('node3', 500); +CALL insert_row('node3', 100); CREATE TABLE t2(i int primary key) engine=innodb; connection node_2; # Restart node_2 @@ -60,7 +63,7 @@ Variable_name Value wsrep_gtid_domain_id 1111 show variables like '%gtid_binlog_pos%'; Variable_name Value -gtid_binlog_pos 1111-1-2503 +gtid_binlog_pos 1111-1-1703 connection node_2; # GTID in node2 show variables like 'wsrep_gtid_domain_id'; @@ -68,7 +71,7 @@ Variable_name Value wsrep_gtid_domain_id 1111 show variables like '%gtid_binlog_pos%'; Variable_name Value -gtid_binlog_pos 1111-1-2503 +gtid_binlog_pos 1111-1-1703 connection node_3; # GTID in node3 show variables like 'wsrep_gtid_domain_id'; @@ -76,7 +79,7 @@ Variable_name Value wsrep_gtid_domain_id 1111 show variables like '%gtid_binlog_pos%'; Variable_name Value -gtid_binlog_pos 1111-1-2503 +gtid_binlog_pos 1111-1-1703 # Shutdown node_3 connection node_3; SET GLOBAL wsrep_provider_options = 'gmcast.isolate = 1'; @@ -98,7 +101,7 @@ Variable_name Value wsrep_gtid_domain_id 1111 show variables like '%gtid_binlog_pos%'; Variable_name Value -gtid_binlog_pos 1111-1-2554 +gtid_binlog_pos 1111-1-1754 connection node_2; # GTID in node2 show variables like 'wsrep_gtid_domain_id'; @@ -106,7 +109,7 @@ Variable_name Value wsrep_gtid_domain_id 1111 show variables like '%gtid_binlog_pos%'; Variable_name Value -gtid_binlog_pos 1111-1-2554 +gtid_binlog_pos 1111-1-1754 connection node_3; # GTID in node3 show variables like 'wsrep_gtid_domain_id'; @@ -114,7 +117,7 @@ Variable_name Value wsrep_gtid_domain_id 1111 show variables like '%gtid_binlog_pos%'; Variable_name Value -gtid_binlog_pos 1111-1-2554 +gtid_binlog_pos 1111-1-1754 # One by one shutdown all nodes connection node_3; # shutdown node_3 @@ -132,7 +135,7 @@ Variable_name Value wsrep_gtid_domain_id 1111 show variables like '%gtid_binlog_pos%'; Variable_name Value -gtid_binlog_pos 1111-1-2554 +gtid_binlog_pos 1111-1-1754 ANALYZE TABLE t2; Table Op Msg_type Msg_text test.t2 analyze status Engine-independent statistics collected @@ -163,7 +166,7 @@ Variable_name Value wsrep_gtid_domain_id 1111 show variables like '%gtid_binlog_pos%'; Variable_name Value -gtid_binlog_pos 1111-1-2756 +gtid_binlog_pos 1111-1-1956 connection node_2; node2 GTID show variables like 'wsrep_gtid_domain_id'; @@ -171,7 +174,7 @@ Variable_name Value wsrep_gtid_domain_id 1111 show variables like '%gtid_binlog_pos%'; Variable_name Value -gtid_binlog_pos 1111-1-2756 +gtid_binlog_pos 1111-1-1956 connection node_3; node3 GTID show variables like 'wsrep_gtid_domain_id'; @@ -179,22 +182,22 @@ Variable_name Value wsrep_gtid_domain_id 1111 show variables like '%gtid_binlog_pos%'; Variable_name Value -gtid_binlog_pos 1111-1-2756 +gtid_binlog_pos 1111-1-1956 connection node_1; table size in node1 SELECT COUNT(*) FROM t1; COUNT(*) -2750 +1950 connection node_2; table size in node2 SELECT COUNT(*) FROM t1; COUNT(*) -2750 +1950 connection node_3; table size in node3 SELECT COUNT(*) FROM t1; COUNT(*) -2750 +1950 connection node_2; call mtr.add_suppression("WSREP: Ignoring server id for non bootstrap node"); call mtr.add_suppression("Sending JOIN failed: "); diff --git a/mysql-test/suite/galera_3nodes/t/galera_gtid_consistency.cnf b/mysql-test/suite/galera_3nodes/t/galera_gtid_consistency.cnf index c27490faf36..c0acbe58ad3 100644 --- a/mysql-test/suite/galera_3nodes/t/galera_gtid_consistency.cnf +++ b/mysql-test/suite/galera_3nodes/t/galera_gtid_consistency.cnf @@ -1,38 +1,31 @@ !include ../galera_3nodes.cnf +[mysqld] +loose-galera-gtid-consistency=1 +wsrep_sst_auth="root:" +wsrep_sst_method=mariabackup +log_slave_updates=ON +log_bin=mariadb-bin-log +binlog-format=row +wsrep-gtid-mode=ON +wsrep-debug=1 +gtid-strict-mode=1 + [mysqld.1] wsrep-node-name="node1" -wsrep_gtid_domain_id=1111 gtid_domain_id=2 server_id=10999 -wsrep_sst_auth="root:" -wsrep_sst_method=mariabackup -log_slave_updates=ON -log_bin=mariadb-bin-log -binlog-format=row -wsrep-gtid-mode=ON +wsrep_gtid_domain_id=1111 [mysqld.2] wsrep-node-name="node2" -wsrep_gtid_domain_id=1112 gtid_domain_id=3 -wsrep_sst_auth="root:" -wsrep_sst_method=mariabackup -log_slave_updates=ON -log_bin=mariadb-bin-log -binlog-format=row -wsrep-gtid-mode=ON +wsrep_gtid_domain_id=1112 [mysqld.3] wsrep-node-name="node3" -wsrep_gtid_domain_id=1113 gtid_domain_id=4 -wsrep_sst_auth="root:" -wsrep_sst_method=mariabackup -log_slave_updates=ON -log_bin=mariadb-bin-log -binlog-format=row -wsrep-gtid-mode=ON +wsrep_gtid_domain_id=1113 [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff --git a/mysql-test/suite/galera_3nodes/t/galera_gtid_consistency.test b/mysql-test/suite/galera_3nodes/t/galera_gtid_consistency.test index f00972b0461..6a160720398 100644 --- a/mysql-test/suite/galera_3nodes/t/galera_gtid_consistency.test +++ b/mysql-test/suite/galera_3nodes/t/galera_gtid_consistency.test @@ -13,6 +13,13 @@ # from the bootstrap node (node_1), and use it # --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + +# Save original auto_increment_offset values. +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--source ../galera/include/auto_increment_offset_save.inc + --connect node_2b, 127.0.0.1, root, , test, $NODE_MYPORT_2 set wsrep_sync_wait=0; --connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1 @@ -98,10 +105,10 @@ show variables like '%gtid_binlog_pos%'; # while node 2 is absent # --connection node_1 -CALL insert_row('node1', 500); +CALL insert_row('node1', 100); --connection node_3 -CALL insert_row('node3', 500); +CALL insert_row('node3', 100); CREATE TABLE t2(i int primary key) engine=innodb; @@ -225,12 +232,19 @@ show variables like '%gtid_binlog_pos%'; # bootstap cluster in order node1 - node2 - node3 # send some inserts and DDL after each node started # ---sleep 5 + --echo # Bootstrap from node_1 --connection node_1 --let $restart_parameters = --wsrep_new_cluster --source include/start_mysqld.inc +--let $wait_condition = SELECT VARIABLE_VALUE = 'ON' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_ready'; +--source include/wait_condition.inc +--let $wait_condition = SELECT VARIABLE_VALUE = 'Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc +--let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment' +--source include/wait_condition.inc + show variables like 'wsrep_gtid_domain_id'; show variables like '%gtid_binlog_pos%'; @@ -243,6 +257,13 @@ ANALYZE TABLE t2; --let $_expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.2.expect --source include/start_mysqld.inc +--let $wait_condition = SELECT VARIABLE_VALUE = 'ON' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_ready'; +--source include/wait_condition.inc +--let $wait_condition = SELECT VARIABLE_VALUE = 'Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc +--let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment' +--source include/wait_condition.inc + # # connection node_1b may not be functional anymore, after node was # shutdown, open node_1c for controlling node 1 state @@ -265,6 +286,14 @@ ALTER TABLE t2 ADD COLUMN (k int); --let $_expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.3.expect --source include/start_mysqld.inc +--let $wait_condition = SELECT VARIABLE_VALUE = 'ON' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_ready'; +--source include/wait_condition.inc +--let $wait_condition = SELECT VARIABLE_VALUE = 'Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc +--let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment' +--source include/wait_condition.inc + + --connection node_1c --echo # wait until all nodes are back in cluster --let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; @@ -335,13 +364,18 @@ DROP TABLE t2; DROP TABLE t3; --connection node_3 ---let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't2' +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't3' --source include/wait_condition.inc --connection node_2 ---let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't2' +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't3' --source include/wait_condition.inc +# Restore original auto_increment_offset values. +--let $galera_cluster_size=3 +--source ../galera/include/auto_increment_offset_restore.inc + --disconnect node_3 --disconnect node_2b --disconnect node_1b --disconnect node_1c + From 7e748d075b7dc8aef36b1155cc43253d6a8a8ff2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 30 Aug 2024 08:32:10 +0300 Subject: [PATCH 12/21] MDEV-34841 : Enable working Galera tests * Fixes galera.galera_bf_kill_debug test case. * Enable galera_ssl_upgrade, galera_ssl_reload, galera_pc_bootstrap * Add MDEV to disabled tests that miss it Signed-off-by: Julius Goryavsky --- mysql-test/suite/galera/t/galera_ssl_upgrade.cnf | 4 ++++ mysql-test/suite/galera/t/galera_ssl_upgrade.test | 2 ++ mysql-test/suite/galera_3nodes/disabled.def | 5 +++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf b/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf index 2954ae0f4cb..7c495102564 100644 --- a/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf +++ b/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf @@ -1,5 +1,9 @@ !include ../galera_2nodes.cnf +[mysqld] +loose-galera-ssl-upgrade=1 +wsrep-debug=1 + [mysqld.1] wsrep_provider_options='base_port=@mysqld.1.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem' diff --git a/mysql-test/suite/galera/t/galera_ssl_upgrade.test b/mysql-test/suite/galera/t/galera_ssl_upgrade.test index c09615527fd..78897ffd738 100644 --- a/mysql-test/suite/galera/t/galera_ssl_upgrade.test +++ b/mysql-test/suite/galera/t/galera_ssl_upgrade.test @@ -7,6 +7,8 @@ --source include/galera_cluster.inc --source include/have_innodb.inc --source include/have_ssl_communication.inc +--source include/have_openssl.inc +--source include/force_restart.inc # Save original auto_increment_offset values. --let $node_1=node_1 diff --git a/mysql-test/suite/galera_3nodes/disabled.def b/mysql-test/suite/galera_3nodes/disabled.def index b7f0d8da8c1..c52df019c65 100644 --- a/mysql-test/suite/galera_3nodes/disabled.def +++ b/mysql-test/suite/galera_3nodes/disabled.def @@ -11,5 +11,6 @@ ############################################################################## galera_2_cluster : MDEV-32631 galera_2_cluster: before_rollback(): Assertion `0' failed -galera_pc_bootstrap : temporarily disabled at the request of Codership -galera_ipv6_mariabackup_section : temporarily disabled at the request of Codership +galera_ipv6_rsync : MDEV-34842 Can't connect to server on '::1' (115) +galera_ipv6_rsync_section : MDEV-34842 Can't connect to server on '::1' (115) +galera_ipv6_mariabackup_section : MDEV-34842 Can't connect to server on '::1' (115) From 1c48950e1f82829c6a21cd0b54d5011959a54755 Mon Sep 17 00:00:00 2001 From: Denis Protivensky Date: Wed, 19 Jun 2024 14:08:13 +0300 Subject: [PATCH 13/21] MDEV-30536: Fix Galera bulk insert optimization MTR test After closing https://github.com/codership/galera-bugs/issues/947, Galera now correctly certifies table-level keys, which made bulk insert work again. The corresponding MTR test is made deterministic and re-enabled. Requires Galera 26.4.19 Signed-off-by: Julius Goryavsky --- mysql-test/suite/galera/disabled.def | 1 - .../suite/galera/r/galera_insert_bulk.result | 19 +++++++- .../suite/galera/t/galera_insert_bulk.test | 43 +++++++++++++++++-- 3 files changed, 57 insertions(+), 6 deletions(-) diff --git a/mysql-test/suite/galera/disabled.def b/mysql-test/suite/galera/disabled.def index 447d30c458d..0c15ec63f2c 100644 --- a/mysql-test/suite/galera/disabled.def +++ b/mysql-test/suite/galera/disabled.def @@ -14,7 +14,6 @@ galera_as_slave_ctas : MDEV-28378 timeout galera_pc_recovery : MDEV-25199 cluster fails to start up galera_bf_kill_debug : timeout after 900 seconds galera_ssl_upgrade : [Warning] Failed to load slave replication state from table mysql.gtid_slave_pos: 130: Incorrect file format 'gtid_slave_pos' -galera_insert_bulk : MDEV-30536 no expected deadlock in galera_insert_bulk test galera_sequences : MDEV-32561 WSREP FSM failure: no such a transition REPLICATING -> COMMITTED galera_concurrent_ctas : MDEV-32779 galera_concurrent_ctas: assertion in the galera::ReplicatorSMM::finish_cert() galera_as_slave_replay : MDEV-32780 galera_as_slave_replay: assertion in the wsrep::transaction::before_rollback() diff --git a/mysql-test/suite/galera/r/galera_insert_bulk.result b/mysql-test/suite/galera/r/galera_insert_bulk.result index f4d4adf64e1..7191464ba64 100644 --- a/mysql-test/suite/galera/r/galera_insert_bulk.result +++ b/mysql-test/suite/galera/r/galera_insert_bulk.result @@ -2,6 +2,8 @@ connection node_2; connection node_1; connection node_1; CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; +SET GLOBAL DEBUG_DBUG = 'd,sync.wsrep_apply_cb'; +connection node_1; SET foreign_key_checks = 0; SET unique_checks = 0; START TRANSACTION; @@ -10,11 +12,20 @@ SET foreign_key_checks = 1; SET unique_checks = 1; INSERT INTO t1 VALUES (1001); connection node_1; +SET DEBUG_SYNC = 'wsrep_before_certification WAIT_FOR sync.wsrep_apply_cb_reached'; +SET DEBUG_SYNC = 'wsrep_after_certification SIGNAL signal.wsrep_apply_cb'; COMMIT; ERROR 40001: Deadlock found when trying to get lock; try restarting transaction DROP TABLE t1; +SET GLOBAL DEBUG_DBUG = ''; +SET DEBUG_SYNC = 'RESET'; connection node_1; CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; +connection node_2; +SET GLOBAL DEBUG_DBUG = 'd,sync.wsrep_apply_cb'; +connection node_1; +SET foreign_key_checks = 0; +SET unique_checks = 0; START TRANSACTION; connection node_2; SET foreign_key_checks = 1; @@ -23,8 +34,14 @@ START TRANSACTION; INSERT INTO t1 VALUES (1001); connection node_1; COMMIT; -2 +3 connection node_2; +SET DEBUG_SYNC = 'wsrep_before_certification WAIT_FOR sync.wsrep_apply_cb_reached'; +SET DEBUG_SYNC = 'wsrep_after_certification SIGNAL signal.wsrep_apply_cb'; COMMIT; ERROR 40001: Deadlock found when trying to get lock; try restarting transaction DROP TABLE t1; +SET GLOBAL DEBUG_DBUG = ''; +SET DEBUG_SYNC = 'RESET'; +disconnect node_2; +disconnect node_1; diff --git a/mysql-test/suite/galera/t/galera_insert_bulk.test b/mysql-test/suite/galera/t/galera_insert_bulk.test index f58870d5f74..7faf8356420 100644 --- a/mysql-test/suite/galera/t/galera_insert_bulk.test +++ b/mysql-test/suite/galera/t/galera_insert_bulk.test @@ -5,6 +5,8 @@ --source include/galera_cluster.inc --source include/have_innodb.inc +--source include/have_debug_sync.inc +--source include/have_debug.inc # # Make bulk insert BF-abort, but regular insert succeed. @@ -13,6 +15,10 @@ --connection node_1 CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; +# Delay applying of the single insert from the other node. +SET GLOBAL DEBUG_DBUG = 'd,sync.wsrep_apply_cb'; + +--connection node_1 # Disable foreign and unique key checks to allow bulk insert. SET foreign_key_checks = 0; SET unique_checks = 0; @@ -30,7 +36,7 @@ while ($count < 1000) --connection node_2 -# Disable bulk insert. +# Disable bulk insert on this node. SET foreign_key_checks = 1; SET unique_checks = 1; @@ -38,10 +44,20 @@ SET unique_checks = 1; INSERT INTO t1 VALUES (1001); --connection node_1 + +# We need to trigger Galera-level certification conflict. For this: +# - start applying single insert from the other node before bulk insert certifies +# - certifying bulk insert will lead to the conflict +# - keep applying single insert +SET DEBUG_SYNC = 'wsrep_before_certification WAIT_FOR sync.wsrep_apply_cb_reached'; +SET DEBUG_SYNC = 'wsrep_after_certification SIGNAL signal.wsrep_apply_cb'; + --error ER_LOCK_DEADLOCK COMMIT; DROP TABLE t1; +SET GLOBAL DEBUG_DBUG = ''; +SET DEBUG_SYNC = 'RESET'; # # Make bulk insert succeed, but regular insert BF-abort. @@ -50,8 +66,17 @@ DROP TABLE t1; --connection node_1 CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; +--connection node_2 +# Delay applying of the bulk insert from the other node. +SET GLOBAL DEBUG_DBUG = 'd,sync.wsrep_apply_cb'; + +--connection node_1 --let $before_bulk_keys = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_repl_keys'` +# Disable foreign and unique key checks to allow bulk insert. +SET foreign_key_checks = 0; +SET unique_checks = 0; + START TRANSACTION; --let $count=0 @@ -64,8 +89,7 @@ while ($count < 1000) --enable_query_log --connection node_2 - -# Disable bulk insert. +# Disable bulk insert on this node. SET foreign_key_checks = 1; SET unique_checks = 1; @@ -77,12 +101,23 @@ INSERT INTO t1 VALUES (1001); --connection node_1 COMMIT; -# Expect two keys to be added for bulk insert: DB-level shared key and table-level exclusive key. +# Expect three keys to be added for bulk insert: "zero-level" key, DB-level shared key and table-level exclusive key. --let $bulk_keys_count = `SELECT VARIABLE_VALUE - $before_bulk_keys FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_repl_keys'` --echo $bulk_keys_count --connection node_2 + +# We need to trigger Galera-level certification conflict. For this: +# - start applying bulk insert from the other node before local insert certifies +# - certifying local insert will lead to the conflict +# - keep applying bulk insert +SET DEBUG_SYNC = 'wsrep_before_certification WAIT_FOR sync.wsrep_apply_cb_reached'; +SET DEBUG_SYNC = 'wsrep_after_certification SIGNAL signal.wsrep_apply_cb'; + --error ER_LOCK_DEADLOCK COMMIT; DROP TABLE t1; +SET GLOBAL DEBUG_DBUG = ''; +SET DEBUG_SYNC = 'RESET'; +--source include/galera_end.inc From b1f75221701c3ddd5de25ebb2322b3a109ccf20a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 30 Aug 2024 08:32:10 +0300 Subject: [PATCH 14/21] MDEV-34841 : Enable working Galera tests * Fixes galera.galera_bf_kill_debug test case. * Enable galera_ssl_upgrade, galera_ssl_reload, galera_pc_bootstrap * Add MDEV to disabled tests that miss it P.S. This commit contains additional changes compared to the similar commit for 10.5 branch. Signed-off-by: Julius Goryavsky --- mysql-test/suite/galera/disabled.def | 2 -- mysql-test/suite/galera/r/galera_bf_kill_debug.result | 7 +++++-- mysql-test/suite/galera/t/galera_bf_kill_debug.test | 9 +++++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/mysql-test/suite/galera/disabled.def b/mysql-test/suite/galera/disabled.def index 0c15ec63f2c..e00bdbf85c0 100644 --- a/mysql-test/suite/galera/disabled.def +++ b/mysql-test/suite/galera/disabled.def @@ -12,8 +12,6 @@ galera_as_slave_ctas : MDEV-28378 timeout galera_pc_recovery : MDEV-25199 cluster fails to start up -galera_bf_kill_debug : timeout after 900 seconds -galera_ssl_upgrade : [Warning] Failed to load slave replication state from table mysql.gtid_slave_pos: 130: Incorrect file format 'gtid_slave_pos' galera_sequences : MDEV-32561 WSREP FSM failure: no such a transition REPLICATING -> COMMITTED galera_concurrent_ctas : MDEV-32779 galera_concurrent_ctas: assertion in the galera::ReplicatorSMM::finish_cert() galera_as_slave_replay : MDEV-32780 galera_as_slave_replay: assertion in the wsrep::transaction::before_rollback() diff --git a/mysql-test/suite/galera/r/galera_bf_kill_debug.result b/mysql-test/suite/galera/r/galera_bf_kill_debug.result index c3eae243f47..52bd1b0e370 100644 --- a/mysql-test/suite/galera/r/galera_bf_kill_debug.result +++ b/mysql-test/suite/galera/r/galera_bf_kill_debug.result @@ -22,16 +22,19 @@ update t1 set b= 1 where a=1; connection node_2b; SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.before_wsrep_thd_abort_reached"; connection node_2; -SET DEBUG_SYNC= 'before_awake_no_mutex SIGNAL awake_reached WAIT_FOR continue_kill'; +SET DEBUG_SYNC= 'wsrep_kill_before_awake_no_mutex SIGNAL awake_reached WAIT_FOR continue_kill'; connection node_2b; SET DEBUG_SYNC='now WAIT_FOR awake_reached'; SET GLOBAL debug_dbug = ""; SET DEBUG_SYNC = "now SIGNAL signal.before_wsrep_thd_abort"; +connection node_1; +COMMIT; +connection node_2b; SET DEBUG_SYNC = "now SIGNAL continue_kill"; connection node_2; connection node_2a; select * from t1; -connection node_2; +connection node_2b; SET DEBUG_SYNC = "RESET"; drop table t1; disconnect node_2a; diff --git a/mysql-test/suite/galera/t/galera_bf_kill_debug.test b/mysql-test/suite/galera/t/galera_bf_kill_debug.test index c322f283757..d24a6dd19ef 100644 --- a/mysql-test/suite/galera/t/galera_bf_kill_debug.test +++ b/mysql-test/suite/galera/t/galera_bf_kill_debug.test @@ -66,7 +66,7 @@ SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.before_wsrep_thd_abort_reached"; # # pause KILL execution before awake # -SET DEBUG_SYNC= 'before_awake_no_mutex SIGNAL awake_reached WAIT_FOR continue_kill'; +SET DEBUG_SYNC= 'wsrep_kill_before_awake_no_mutex SIGNAL awake_reached WAIT_FOR continue_kill'; --disable_query_log --send_eval KILL $k_thread --enable_query_log @@ -78,6 +78,11 @@ SET DEBUG_SYNC='now WAIT_FOR awake_reached'; # release applier and KILL operator SET GLOBAL debug_dbug = ""; SET DEBUG_SYNC = "now SIGNAL signal.before_wsrep_thd_abort"; + +--connection node_1 +COMMIT; + +--connection node_2b SET DEBUG_SYNC = "now SIGNAL continue_kill"; --connection node_2 @@ -87,7 +92,7 @@ SET DEBUG_SYNC = "now SIGNAL continue_kill"; --error 0,1213,2013 select * from t1; ---connection node_2 +--connection node_2b SET DEBUG_SYNC = "RESET"; drop table t1; From a50a5e0f3b5e93ffa07ea13ef90a3a3356bba5d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Fri, 26 Jul 2024 09:04:30 +0300 Subject: [PATCH 15/21] MDEV-34647 : 'INSERT...SELECT' on MyISAM table suddenly replicated by Galera Replication of MyISAM and Aria DML is experimental and best effort only. Earlier change make INSERT SELECT on both MyISAM and Aria to replicate using TOI and STATEMENT replication. Replication should happen only if user has set needed wsrep_mode setting. Note: This commit contains additional changes compared to those already made for the 10.5 branch. + small refactoring after main fix. Signed-off-by: Julius Goryavsky --- mysql-test/suite/galera/r/MDEV-34647.result | 3 ++- mysql-test/suite/galera/r/mdev-22063.result | 4 ++++ mysql-test/suite/galera/t/MDEV-34647.test | 5 ++--- mysql-test/suite/galera/t/mdev-22063.test | 5 +++++ sql/sql_parse.cc | 5 ++++- 5 files changed, 17 insertions(+), 5 deletions(-) diff --git a/mysql-test/suite/galera/r/MDEV-34647.result b/mysql-test/suite/galera/r/MDEV-34647.result index 16a4e839f13..0333f14ece1 100644 --- a/mysql-test/suite/galera/r/MDEV-34647.result +++ b/mysql-test/suite/galera/r/MDEV-34647.result @@ -39,7 +39,7 @@ id val 7 d 9 d 11 d -set global wsrep_mode=REPLICATE_MYISAM; +set global wsrep_mode='REPLICATE_MYISAM,REPLICATE_ARIA'; create table t4(id serial, val varchar(100)) engine=myisam; insert into t4 values(null, 'a'); insert into t4 values(null, 'b'); @@ -95,6 +95,7 @@ id val 4 d 5 d 6 d +set global wsrep_mode=default; connection node_1; drop table t1,t2,t3,t4,t5; set global wsrep_mode=default; diff --git a/mysql-test/suite/galera/r/mdev-22063.result b/mysql-test/suite/galera/r/mdev-22063.result index 585d70acb61..228f63d6688 100644 --- a/mysql-test/suite/galera/r/mdev-22063.result +++ b/mysql-test/suite/galera/r/mdev-22063.result @@ -17,12 +17,14 @@ SELECT * FROM s; next_not_cached_value minimum_value maximum_value start_value increment cache_size cycle_option cycle_count 1 1 9223372036854775806 1 1 1000 0 0 connection node_2; +SET GLOBAL WSREP_MODE='REPLICATE_ARIA,REPLICATE_MYISAM'; SELECT * FROM t1; a SELECT * FROM s; next_not_cached_value minimum_value maximum_value start_value increment cache_size cycle_option cycle_count 1 1 9223372036854775806 1 1 1000 0 0 connection node_1; +SET GLOBAL WSREP_MODE='REPLICATE_ARIA,REPLICATE_MYISAM'; DROP TABLE t1; DROP SEQUENCE s; # Case 2 REPLACE INTO ... SELECT with error @@ -240,3 +242,5 @@ pk DROP TABLE t1; DROP VIEW view_t1; SET GLOBAL wsrep_mode=DEFAULT; +connection node_2; +SET GLOBAL wsrep_mode=DEFAULT; diff --git a/mysql-test/suite/galera/t/MDEV-34647.test b/mysql-test/suite/galera/t/MDEV-34647.test index 8840c233c6d..db776681aa1 100644 --- a/mysql-test/suite/galera/t/MDEV-34647.test +++ b/mysql-test/suite/galera/t/MDEV-34647.test @@ -22,7 +22,7 @@ insert into t3 select null, 'c'; insert into t3 select null, 'd' from t3; select * from t3; -set global wsrep_mode=REPLICATE_MYISAM; +set global wsrep_mode='REPLICATE_MYISAM,REPLICATE_ARIA'; create table t4(id serial, val varchar(100)) engine=myisam; insert into t4 values(null, 'a'); @@ -45,9 +45,8 @@ select * from t2; select * from t3; select * from t4; select * from t5; - +set global wsrep_mode=default; --connection node_1 drop table t1,t2,t3,t4,t5; set global wsrep_mode=default; - diff --git a/mysql-test/suite/galera/t/mdev-22063.test b/mysql-test/suite/galera/t/mdev-22063.test index ccc199e308f..260067d8a7b 100644 --- a/mysql-test/suite/galera/t/mdev-22063.test +++ b/mysql-test/suite/galera/t/mdev-22063.test @@ -16,6 +16,7 @@ SELECT * FROM t1; SELECT * FROM s; --connection node_2 +SET GLOBAL WSREP_MODE='REPLICATE_ARIA,REPLICATE_MYISAM'; --let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1' --source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 's' @@ -27,6 +28,7 @@ SELECT * FROM t1; SELECT * FROM s; --connection node_1 +SET GLOBAL WSREP_MODE='REPLICATE_ARIA,REPLICATE_MYISAM'; DROP TABLE t1; DROP SEQUENCE s; @@ -183,3 +185,6 @@ SELECT * FROM t1; DROP TABLE t1; DROP VIEW view_t1; SET GLOBAL wsrep_mode=DEFAULT; + +--connection node_2 +SET GLOBAL wsrep_mode=DEFAULT; diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index 9d432e7f821..3bf814d663f 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -4728,8 +4728,11 @@ mysql_execute_command(THD *thd, bool is_called_from_prepared_stmt) if (!wsrep_toi) { /* Currently we support TOI for MyISAM only. */ - if (db_type == DB_TYPE_MYISAM && wsrep_replicate_myisam) + if ((db_type == DB_TYPE_MYISAM && wsrep_check_mode(WSREP_MODE_REPLICATE_MYISAM)) || + (db_type == DB_TYPE_ARIA && wsrep_check_mode(WSREP_MODE_REPLICATE_ARIA))) + { WSREP_TO_ISOLATION_BEGIN(first_table->db.str, first_table->table_name.str, NULL); + } } } } From 72243bc23605ff0ba7409789e4fda1e43be83427 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Wed, 14 Jun 2023 08:55:58 +0300 Subject: [PATCH 16/21] MDEV-31173 : Server crashes when setting wsrep_cluster_address after adding invalid value to wsrep_allowlist table Problem was that wsrep_schema tables were not marked as category information. Fix allows access to wsrep_schema tables even when node is detached. Signed-off-by: Julius Goryavsky --- .../suite/galera/r/galera_wsrep_schema_detached.result | 9 +++++++++ .../suite/galera/t/galera_wsrep_schema_detached.test | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result b/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result index 493bf9d3cac..41275ede6d2 100644 --- a/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result +++ b/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result @@ -11,12 +11,21 @@ SET GLOBAL WSREP_ON=0; SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; EXPECT_0 0 +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_allowlist; +EXPECT_0 +0 SELECT COUNT(*) AS EXPECT_1 FROM mysql.wsrep_cluster; EXPECT_1 1 SELECT COUNT(*) AS EXPECT_2 FROM mysql.wsrep_cluster_members; EXPECT_2 2 +INSERT INTO mysql.wsrep_allowlist (ip) VALUES (0); +SET GLOBAL wsrep_cluster_address = @wsrep_cluster_address_orig; +SELECT 1; +1 +1 +DELETE FROM mysql.wsrep_allowlist; connection node_1; SET GLOBAL wsrep_provider_options ='pc.ignore_sb=true'; connection node_2; diff --git a/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test b/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test index f67091a5fb9..9942d63f142 100644 --- a/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test +++ b/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test @@ -13,9 +13,15 @@ SET @wsrep_provider_options_orig = @@GLOBAL.wsrep_provider_options; SET @wsrep_cluster_address_orig = @@GLOBAL.wsrep_cluster_address; SET GLOBAL WSREP_ON=0; SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_allowlist; SELECT COUNT(*) AS EXPECT_1 FROM mysql.wsrep_cluster; SELECT COUNT(*) AS EXPECT_2 FROM mysql.wsrep_cluster_members; +INSERT INTO mysql.wsrep_allowlist (ip) VALUES (0); +SET GLOBAL wsrep_cluster_address = @wsrep_cluster_address_orig; +SELECT 1; +DELETE FROM mysql.wsrep_allowlist; + --connection node_1 SET GLOBAL wsrep_provider_options ='pc.ignore_sb=true'; From 819765a47dd1aab58fc1693ce7b19c1f3015b375 Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Fri, 9 Aug 2024 11:22:14 +0300 Subject: [PATCH 17/21] Code cleanup in test_if_skip_sort_order() Added comments Moved a part into find_indexes_matching_order(). --- sql/sql_select.cc | 112 +++++++++++++++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 32 deletions(-) diff --git a/sql/sql_select.cc b/sql/sql_select.cc index d8958041160..1b8552a805e 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -330,6 +330,14 @@ static void optimize_rownum(THD *thd, SELECT_LEX_UNIT *unit, Item *cond); static bool process_direct_rownum_comparison(THD *thd, SELECT_LEX_UNIT *unit, Item *cond); +static +bool find_indexes_matching_order(JOIN *join, TABLE *table, ORDER *order, + key_map *usable_keys); +static +void compute_part_of_sort_key_for_equals(JOIN *join, TABLE *table, + Item_field *item_field, + key_map *col_keys); + #ifndef DBUG_OFF /* @@ -24633,6 +24641,7 @@ find_field_in_item_list (Field *field, void *data) that belong to 'table' and are equal to 'item_field'. */ +static void compute_part_of_sort_key_for_equals(JOIN *join, TABLE *table, Item_field *item_field, key_map *col_keys) @@ -24777,6 +24786,58 @@ static void prepare_for_reverse_ordered_access(JOIN_TAB *tab) } +/* + @brief + Given a table and order, find indexes that produce rows in the order + + @param usable_keys OUT Bitmap of indexes that produce rows in order. + + @return + false Some indexes were found + true No indexes found +*/ + +static +bool find_indexes_matching_order(JOIN *join, TABLE *table, ORDER *order, + key_map *usable_keys) +{ + /* Find indexes that cover all ORDER/GROUP BY fields */ + for (ORDER *tmp_order=order; tmp_order ; tmp_order=tmp_order->next) + { + Item *item= (*tmp_order->item)->real_item(); + if (item->type() != Item::FIELD_ITEM) + { + usable_keys->clear_all(); + return true; /* No suitable keys */ + } + + /* + Take multiple-equalities into account. Suppose we have + ORDER BY col1, col10 + and there are + multiple-equal(col1, col2, col3), + multiple-equal(col10, col11). + + Then, + - when item=col1, we find the set of indexes that cover one of {col1, + col2, col3} + - when item=col10, we find the set of indexes that cover one of {col10, + col11} + + And we compute an intersection of these sets to find set of indexes that + cover all ORDER BY components. + */ + key_map col_keys; + compute_part_of_sort_key_for_equals(join, table, (Item_field*)item, + &col_keys); + usable_keys->intersect(col_keys); + if (usable_keys->is_clear_all()) + return true; // No usable keys + } + return false; + +} + /** Test if we can skip the ORDER BY by using an index. @@ -24838,41 +24899,17 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, been taken into account. */ usable_keys= *map; - - /* Find indexes that cover all ORDER/GROUP BY fields */ - for (ORDER *tmp_order=order; tmp_order ; tmp_order=tmp_order->next) + + // Step #1: Find indexes that produce the required ordering. + if (find_indexes_matching_order(tab->join, table, order, &usable_keys)) { - Item *item= (*tmp_order->item)->real_item(); - if (item->type() != Item::FIELD_ITEM) - { - usable_keys.clear_all(); - DBUG_RETURN(0); - } - - /* - Take multiple-equalities into account. Suppose we have - ORDER BY col1, col10 - and there are - multiple-equal(col1, col2, col3), - multiple-equal(col10, col11). - - Then, - - when item=col1, we find the set of indexes that cover one of {col1, - col2, col3} - - when item=col10, we find the set of indexes that cover one of {col10, - col11} - - And we compute an intersection of these sets to find set of indexes that - cover all ORDER BY components. - */ - key_map col_keys; - compute_part_of_sort_key_for_equals(tab->join, table, (Item_field*)item, - &col_keys); - usable_keys.intersect(col_keys); - if (usable_keys.is_clear_all()) - goto use_filesort; // No usable keys + DBUG_RETURN(false); // Cannot skip sorting } + /* + Step #2: Analyze the current access method. Note the used index as ref_key + and #used keyparts in ref_key_parts. + */ ref_key= -1; /* Test if constant range in WHERE */ if (tab->ref.key >= 0 && tab->ref.key_parts) @@ -24916,6 +24953,12 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, } } + /* + Step #3: Check if index ref_key that we're using produces the required + ordering or if there is another index new_ref_key such that + - ref_key is a prefix of new_ref_key (so, access method can be reused) + - new_ref_key produces the required ordering + */ if (ref_key >= 0 && ref_key != MAX_KEY) { /* Current access method uses index ref_key with ref_key_parts parts */ @@ -25035,6 +25078,11 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, &used_key_parts))) goto check_reverse_order; } + + /* + Step #4: Go through all indexes that produce required ordering (in + usable_keys) and check if any of them is cheaper than ref_key + */ { uint UNINIT_VAR(best_key_parts); uint saved_best_key_parts= 0; From c8d040938a7ebe10e62506a726702c5990ef4dda Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Sun, 18 Aug 2024 20:10:19 +0300 Subject: [PATCH 18/21] MDEV-34720: Poor plan choice for large JOIN with ORDER BY and small LIMIT (Variant 2b: call greedy_search() twice, correct handling for limited search_depth) Modify the join optimizer to specifically try to produce join orders that can short-cut their execution for ORDER BY..LIMIT clause. The optimization is controlled by @@optimizer_join_limit_pref_ratio. Default value 0 means don't construct short-cutting join orders. Other value means construct short-cutting join order, and prefer it only if it promises speedup of more than #value times. In Optimizer Trace, look for these names: * join_limit_shortcut_is_applicable * join_limit_shortcut_plan_search * join_limit_shortcut_choice --- mysql-test/main/mysqld--help.result | 10 + mysql-test/main/order_by_limit_join.result | 461 ++++++++++++++++++ mysql-test/main/order_by_limit_join.test | 207 ++++++++ .../sys_vars/r/sysvars_server_embedded.result | 10 + .../r/sysvars_server_notembedded.result | 10 + sql/sql_class.h | 1 + sql/sql_select.cc | 394 ++++++++++++++- sql/sql_select.h | 14 + sql/sys_vars.cc | 14 + 9 files changed, 1110 insertions(+), 11 deletions(-) create mode 100644 mysql-test/main/order_by_limit_join.result create mode 100644 mysql-test/main/order_by_limit_join.test diff --git a/mysql-test/main/mysqld--help.result b/mysql-test/main/mysqld--help.result index cabf8037508..08cd1202655 100644 --- a/mysql-test/main/mysqld--help.result +++ b/mysql-test/main/mysqld--help.result @@ -724,6 +724,15 @@ The following specify which files/extra groups are read (specified before remain keys. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. Use 'ALL' to set all combinations. + --optimizer-join-limit-pref-ratio=# + For queries with JOIN and ORDER BY LIMIT : make the + optimizer consider a join order that allows to short-cut + execution after producing #LIMIT matches if that promises + N times speedup. (A conservative setting here would be is + a high value, like 100 so the short-cutting plan is used + if it promises a speedup of 100x or more). Short-cutting + plans are inherently risky so the default is 0 which + means do not consider this optimization --optimizer-max-sel-arg-weight=# The maximum weight of the SEL_ARG graph. Set to 0 for no limit @@ -1697,6 +1706,7 @@ old-mode UTF8_IS_UTF8MB3 old-passwords FALSE old-style-user-limits FALSE optimizer-adjust-secondary-key-costs +optimizer-join-limit-pref-ratio 0 optimizer-max-sel-arg-weight 32000 optimizer-max-sel-args 16000 optimizer-prune-level 1 diff --git a/mysql-test/main/order_by_limit_join.result b/mysql-test/main/order_by_limit_join.result new file mode 100644 index 00000000000..3f48aa3f65f --- /dev/null +++ b/mysql-test/main/order_by_limit_join.result @@ -0,0 +1,461 @@ +# +# MDEV-34720: Poor plan choice for large JOIN with ORDER BY and small LIMIT +# +create table t1 ( +a int, +b int, +c int, +col1 int, +col2 int, +index(a), +index(b), +index(col1) +); +insert into t1 select +mod(seq, 100), +mod(seq, 95), +seq, +seq, +seq +from +seq_1_to_10000; +create table t10 ( +a int, +a_value char(10), +key(a) +); +insert into t10 select seq, seq from seq_1_to_100; +create table t11 ( +b int, +b_value char(10), +key(b) +); +insert into t11 select seq, seq from seq_1_to_100; +set @tmp_os=@@optimizer_trace; +set optimizer_trace=1; +# +# Query 1 - basic example. +# +# Table t1 is not the first, have to use temporary+filesort: +explain +select +* +from +t1 +join t10 on t1.a=t10.a +join t11 on t1.b=t11.b +order by +t1.col1 +limit 10; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t10 ALL a NULL NULL NULL 100 Using where; Using temporary; Using filesort +1 SIMPLE t1 ref a,b a 5 test.t10.a 100 Using where +1 SIMPLE t11 ref b b 5 test.t1.b 1 +set optimizer_join_limit_pref_ratio=10; +# t1 is first, key=col1 produces ordering, no filesort or temporary: +explain +select +* +from +t1 +join t10 on t1.a=t10.a +join t11 on t1.b=t11.b +order by +t1.col1 +limit 10; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 index a,b col1 5 NULL 10 Using where +1 SIMPLE t10 ref a a 5 test.t1.a 1 +1 SIMPLE t11 ref b b 5 test.t1.b 1 +set @trace=(select trace from information_schema.optimizer_trace); +select json_detailed(json_extract(@trace, '$**.join_limit_shortcut_choice')) as JS; +JS +[ + { + "limit_fraction": 0.001, + "test_if_skip_sort_order_early": + [ + { + "reconsidering_access_paths_for_index_ordering": + { + "clause": "ORDER BY", + "fanout": 1, + "read_time": 53.27053125, + "table": "t1", + "rows_estimation": 10000, + "possible_keys": + [ + { + "index": "a", + "can_resolve_order": false, + "cause": "not usable index for the query" + }, + { + "index": "b", + "can_resolve_order": false, + "cause": "not usable index for the query" + }, + { + "index": "col1", + "can_resolve_order": true, + "updated_limit": 10, + "index_scan_time": 10, + "records": 10000, + "chosen": true + } + ] + } + } + ], + "can_skip_filesort": true, + "full_join_cost": 46064.98442, + "risk_ratio": 10, + "shortcut_join_cost": 97.28224614, + "shortcut_cost_with_risk": 972.8224614, + "use_shortcut_cost": true + } +] +# +# Query 2 - same as above but without a suitable index. +# +# Table t1 is not the first, have to use temporary+filesort: +set optimizer_join_limit_pref_ratio=0; +explain +select +* +from +t1 +join t10 on t1.a=t10.a +join t11 on t1.b=t11.b +order by +t1.col2 +limit 10; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t10 ALL a NULL NULL NULL 100 Using where; Using temporary; Using filesort +1 SIMPLE t1 ref a,b a 5 test.t10.a 100 Using where +1 SIMPLE t11 ref b b 5 test.t1.b 1 +# t1 is first but there's no suitable index, +# so we use filesort but using temporary: +set optimizer_join_limit_pref_ratio=10; +explain +select +* +from +t1 +join t10 on t1.a=t10.a +join t11 on t1.b=t11.b +order by +t1.col2 +limit 10; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL a,b NULL NULL NULL 10000 Using where; Using filesort +1 SIMPLE t10 ref a a 5 test.t1.a 1 +1 SIMPLE t11 ref b b 5 test.t1.b 1 +set @trace=(select trace from information_schema.optimizer_trace); +select json_detailed(json_extract(@trace, '$**.join_limit_shortcut_choice')) as JS; +JS +[ + { + "limit_fraction": 0.001, + "test_if_skip_sort_order_early": + [], + "can_skip_filesort": false, + "full_join_cost": 46064.98442, + "risk_ratio": 10, + "shortcut_join_cost": 2097.281246, + "shortcut_cost_with_risk": 20972.81246, + "use_shortcut_cost": true + } +] +# +# Query 3: Counter example with large limit +# +# Table t1 is not the first, have to use temporary+filesort: +set optimizer_join_limit_pref_ratio=0; +explain +select +* +from +t1 +join t10 on t1.a=t10.a +join t11 on t1.b=t11.b +order by +t1.col1 +limit 5000; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t10 ALL a NULL NULL NULL 100 Using where; Using temporary; Using filesort +1 SIMPLE t1 ref a,b a 5 test.t10.a 100 Using where +1 SIMPLE t11 ref b b 5 test.t1.b 1 +# Same plan as above: +# Table t1 is not the first, have to use temporary+filesort: +set optimizer_join_limit_pref_ratio=10; +explain +select +* +from +t1 +join t10 on t1.a=t10.a +join t11 on t1.b=t11.b +order by +t1.col1 +limit 5000; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t10 ALL a NULL NULL NULL 100 Using where; Using temporary; Using filesort +1 SIMPLE t1 ref a,b a 5 test.t10.a 100 Using where +1 SIMPLE t11 ref b b 5 test.t1.b 1 +set @trace=(select trace from information_schema.optimizer_trace); +select json_detailed(json_extract(@trace, '$**.join_limit_shortcut_choice')) as JS; +JS +[ + { + "limit_fraction": 0.5, + "test_if_skip_sort_order_early": + [ + { + "reconsidering_access_paths_for_index_ordering": + { + "clause": "ORDER BY", + "fanout": 1, + "read_time": 53.27053125, + "table": "t1", + "rows_estimation": 10000, + "possible_keys": + [ + { + "index": "a", + "can_resolve_order": false, + "cause": "not usable index for the query" + }, + { + "index": "b", + "can_resolve_order": false, + "cause": "not usable index for the query" + }, + { + "index": "col1", + "can_resolve_order": true, + "updated_limit": 5000, + "index_scan_time": 5000, + "usable": false, + "cause": "cost" + } + ] + } + } + ], + "can_skip_filesort": false, + "full_join_cost": 46064.98442, + "risk_ratio": 10, + "shortcut_join_cost": 24059.12698, + "shortcut_cost_with_risk": 240591.2698, + "use_shortcut_cost": false + } +] +# +# Query 4: LEFT JOIN makes it impossible to put ORDER-BY-table first, +# however the optimizer still puts it as sort_by_table. +# +set optimizer_join_limit_pref_ratio=10; +explain +select +* +from +t10 left join (t1 join t11 on t1.b=t11.b ) on t1.a=t10.a +order by +t1.col2 +limit 10; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t10 ALL NULL NULL NULL NULL 100 Using temporary; Using filesort +1 SIMPLE t1 ref a,b a 5 test.t10.a 100 Using where +1 SIMPLE t11 ref b b 5 test.t1.b 1 +set @trace=(select trace from information_schema.optimizer_trace); +# This will show nothing as limit shortcut code figures that +# it's not possible to use t1 to construct shortcuts: +select json_detailed(json_extract(@trace, '$**.join_limit_shortcut_choice')) as JS; +JS +NULL +# +# Query 5: Same as Q1 but also with a semi-join +# +set optimizer_join_limit_pref_ratio=default; +# Table t1 is not the first, have to use temporary+filesort: +explain +select +* +from +t1 +join t10 on t1.a=t10.a +join t11 on t1.b=t11.b +where +t1.a in (select a from t10) and +t1.b in (select b from t11) +order by +t1.col1 +limit 10; +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t10 ALL a NULL NULL NULL 100 Using where; Using temporary; Using filesort +1 PRIMARY t10 ref a a 5 test.t10.a 1 Using index; LooseScan +1 PRIMARY t1 ref a,b a 5 test.t10.a 100 Using where +1 PRIMARY t11 ref b b 5 test.t1.b 1 +1 PRIMARY eq_ref distinct_key distinct_key 4 func 1 +3 MATERIALIZED t11 index b b 5 NULL 100 Using index +set optimizer_join_limit_pref_ratio=10; +# t1 is first, key=col1 produces ordering, no filesort or temporary: +explain +select +* +from +t1 +join t10 on t1.a=t10.a +join t11 on t1.b=t11.b +where +t1.a in (select a from t10) and +t1.b in (select b from t11) +order by +t1.col1 +limit 10; +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t1 index a,b col1 5 NULL 1 Using where +1 PRIMARY t10 ref a a 5 test.t1.a 1 +1 PRIMARY t11 ref b b 5 test.t1.b 1 +1 PRIMARY eq_ref distinct_key distinct_key 4 func 1 +1 PRIMARY eq_ref distinct_key distinct_key 4 func 1 +3 MATERIALIZED t11 index b b 5 NULL 100 Using index +2 MATERIALIZED t10 index a a 5 NULL 100 Using index +set @trace=(select trace from information_schema.optimizer_trace); +select json_detailed(json_extract(@trace, '$**.join_limit_shortcut_choice')) as JS; +JS +[ + { + "limit_fraction": 0.001, + "test_if_skip_sort_order_early": + [ + { + "reconsidering_access_paths_for_index_ordering": + { + "clause": "ORDER BY", + "fanout": 1, + "read_time": 53.27053125, + "table": "t1", + "rows_estimation": 10000, + "possible_keys": + [ + { + "index": "a", + "can_resolve_order": false, + "cause": "not usable index for the query" + }, + { + "index": "b", + "can_resolve_order": false, + "cause": "not usable index for the query" + }, + { + "index": "col1", + "can_resolve_order": true, + "updated_limit": 10, + "index_scan_time": 10, + "records": 10000, + "chosen": true + } + ] + } + } + ], + "can_skip_filesort": true, + "full_join_cost": 47079.71684, + "risk_ratio": 10, + "shortcut_join_cost": 98.29697856, + "shortcut_cost_with_risk": 982.9697856, + "use_shortcut_cost": true + } +] +# +# Query 6: same as Query 1 but let's limit the search depth +# +set @tmp_osd=@@optimizer_search_depth; +set optimizer_search_depth=1; +set optimizer_join_limit_pref_ratio=default; +# Table t1 is not the first, have to use temporary+filesort: +explain +select +* +from +t1 +join t10 on t1.a=t10.a +join t11 on t1.b=t11.b +order by +t1.col1 +limit 10; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t10 ALL a NULL NULL NULL 100 Using where; Using temporary; Using filesort +1 SIMPLE t11 ALL b NULL NULL NULL 100 Using join buffer (flat, BNL join) +1 SIMPLE t1 ref a,b a 5 test.t10.a 100 Using where +set optimizer_join_limit_pref_ratio=10; +# t1 is first, key=col1 produces ordering, no filesort or temporary: +explain +select +* +from +t1 +join t10 on t1.a=t10.a +join t11 on t1.b=t11.b +order by +t1.col1 +limit 10; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 index a,b col1 5 NULL 10 Using where +1 SIMPLE t10 ref a a 5 test.t1.a 1 +1 SIMPLE t11 ref b b 5 test.t1.b 1 +set @trace=(select trace from information_schema.optimizer_trace); +select json_detailed(json_extract(@trace, '$**.join_limit_shortcut_choice')) as JS; +JS +[ + { + "limit_fraction": 0.001, + "test_if_skip_sort_order_early": + [ + { + "reconsidering_access_paths_for_index_ordering": + { + "clause": "ORDER BY", + "fanout": 1, + "read_time": 53.27053125, + "table": "t1", + "rows_estimation": 10000, + "possible_keys": + [ + { + "index": "a", + "can_resolve_order": false, + "cause": "not usable index for the query" + }, + { + "index": "b", + "can_resolve_order": false, + "cause": "not usable index for the query" + }, + { + "index": "col1", + "can_resolve_order": true, + "updated_limit": 10, + "index_scan_time": 10, + "records": 10000, + "chosen": true + } + ] + } + } + ], + "can_skip_filesort": true, + "full_join_cost": 46064.98442, + "risk_ratio": 10, + "shortcut_join_cost": 97.28224614, + "shortcut_cost_with_risk": 972.8224614, + "use_shortcut_cost": true + } +] +set optimizer_search_depth=@tmp_osd; +set optimizer_trace=@tmp_os; +set optimizer_join_limit_pref_ratio=default; +drop table t1, t10, t11; diff --git a/mysql-test/main/order_by_limit_join.test b/mysql-test/main/order_by_limit_join.test new file mode 100644 index 00000000000..da05cdc30c3 --- /dev/null +++ b/mysql-test/main/order_by_limit_join.test @@ -0,0 +1,207 @@ +--echo # +--echo # MDEV-34720: Poor plan choice for large JOIN with ORDER BY and small LIMIT +--echo # + +--source include/have_sequence.inc + +# We need optimizer trace +--source include/not_embedded.inc + +create table t1 ( + a int, + b int, + c int, + col1 int, + col2 int, + index(a), + index(b), + index(col1) +); + +insert into t1 select + mod(seq, 100), + mod(seq, 95), + seq, + seq, + seq +from + seq_1_to_10000; + + +create table t10 ( + a int, + a_value char(10), + key(a) +); +insert into t10 select seq, seq from seq_1_to_100; + +create table t11 ( + b int, + b_value char(10), + key(b) +); +insert into t11 select seq, seq from seq_1_to_100; + +set @tmp_os=@@optimizer_trace; +set optimizer_trace=1; + +--echo # +--echo # Query 1 - basic example. +--echo # +let $query= explain +select + * +from + t1 + join t10 on t1.a=t10.a + join t11 on t1.b=t11.b +order by + t1.col1 +limit 10; + +--echo # Table t1 is not the first, have to use temporary+filesort: +eval $query; + +set optimizer_join_limit_pref_ratio=10; + +--echo # t1 is first, key=col1 produces ordering, no filesort or temporary: +eval $query; + +set @trace=(select trace from information_schema.optimizer_trace); +select json_detailed(json_extract(@trace, '$**.join_limit_shortcut_choice')) as JS; + +--echo # +--echo # Query 2 - same as above but without a suitable index. +--echo # +let $query= +explain +select + * +from + t1 + join t10 on t1.a=t10.a + join t11 on t1.b=t11.b +order by + t1.col2 +limit 10; + +--echo # Table t1 is not the first, have to use temporary+filesort: +set optimizer_join_limit_pref_ratio=0; +eval $query; + +--echo # t1 is first but there's no suitable index, +--echo # so we use filesort but using temporary: +set optimizer_join_limit_pref_ratio=10; +eval $query; + +set @trace=(select trace from information_schema.optimizer_trace); +select json_detailed(json_extract(@trace, '$**.join_limit_shortcut_choice')) as JS; + +--echo # +--echo # Query 3: Counter example with large limit +--echo # +let $query= explain +select + * +from + t1 + join t10 on t1.a=t10.a + join t11 on t1.b=t11.b +order by + t1.col1 +limit 5000; + +--echo # Table t1 is not the first, have to use temporary+filesort: +set optimizer_join_limit_pref_ratio=0; +eval $query; + +--echo # Same plan as above: +--echo # Table t1 is not the first, have to use temporary+filesort: +set optimizer_join_limit_pref_ratio=10; +eval $query; + +set @trace=(select trace from information_schema.optimizer_trace); +select json_detailed(json_extract(@trace, '$**.join_limit_shortcut_choice')) as JS; + +--echo # +--echo # Query 4: LEFT JOIN makes it impossible to put ORDER-BY-table first, +--echo # however the optimizer still puts it as sort_by_table. +--echo # +set optimizer_join_limit_pref_ratio=10; +explain +select + * +from + t10 left join (t1 join t11 on t1.b=t11.b ) on t1.a=t10.a +order by + t1.col2 +limit 10; + +set @trace=(select trace from information_schema.optimizer_trace); +--echo # This will show nothing as limit shortcut code figures that +--echo # it's not possible to use t1 to construct shortcuts: +select json_detailed(json_extract(@trace, '$**.join_limit_shortcut_choice')) as JS; + +--echo # +--echo # Query 5: Same as Q1 but also with a semi-join +--echo # +set optimizer_join_limit_pref_ratio=default; +let $query= explain +select + * +from + t1 + join t10 on t1.a=t10.a + join t11 on t1.b=t11.b +where + t1.a in (select a from t10) and + t1.b in (select b from t11) +order by + t1.col1 +limit 10; + +--echo # Table t1 is not the first, have to use temporary+filesort: +eval $query; + +set optimizer_join_limit_pref_ratio=10; + +--echo # t1 is first, key=col1 produces ordering, no filesort or temporary: +eval $query; + +set @trace=(select trace from information_schema.optimizer_trace); +select json_detailed(json_extract(@trace, '$**.join_limit_shortcut_choice')) as JS; + +--echo # +--echo # Query 6: same as Query 1 but let's limit the search depth +--echo # +set @tmp_osd=@@optimizer_search_depth; +set optimizer_search_depth=1; +let $query= explain +select + * +from + t1 + join t10 on t1.a=t10.a + join t11 on t1.b=t11.b +order by + t1.col1 +limit 10; + +set optimizer_join_limit_pref_ratio=default; +--echo # Table t1 is not the first, have to use temporary+filesort: +eval $query; + +set optimizer_join_limit_pref_ratio=10; + +--echo # t1 is first, key=col1 produces ordering, no filesort or temporary: +eval $query; + +set @trace=(select trace from information_schema.optimizer_trace); +select json_detailed(json_extract(@trace, '$**.join_limit_shortcut_choice')) as JS; + + +set optimizer_search_depth=@tmp_osd; +set optimizer_trace=@tmp_os; +set optimizer_join_limit_pref_ratio=default; +drop table t1, t10, t11; + diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result index b5c47ac5ecd..42b661bc448 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result @@ -2282,6 +2282,16 @@ NUMERIC_BLOCK_SIZE NULL ENUM_VALUE_LIST adjust_secondary_key_cost,disable_max_seek,disable_forced_index_in_group_by,fix_innodb_cardinality READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME OPTIMIZER_JOIN_LIMIT_PREF_RATIO +VARIABLE_SCOPE SESSION +VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_COMMENT For queries with JOIN and ORDER BY LIMIT : make the optimizer consider a join order that allows to short-cut execution after producing #LIMIT matches if that promises N times speedup. (A conservative setting here would be is a high value, like 100 so the short-cutting plan is used if it promises a speedup of 100x or more). Short-cutting plans are inherently risky so the default is 0 which means do not consider this optimization +NUMERIC_MIN_VALUE 0 +NUMERIC_MAX_VALUE 4294967295 +NUMERIC_BLOCK_SIZE 1 +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME OPTIMIZER_MAX_SEL_ARGS VARIABLE_SCOPE SESSION VARIABLE_TYPE BIGINT UNSIGNED diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result index 59b9cbb8d96..b626551a570 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result @@ -2442,6 +2442,16 @@ NUMERIC_BLOCK_SIZE NULL ENUM_VALUE_LIST adjust_secondary_key_cost,disable_max_seek,disable_forced_index_in_group_by,fix_innodb_cardinality READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME OPTIMIZER_JOIN_LIMIT_PREF_RATIO +VARIABLE_SCOPE SESSION +VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_COMMENT For queries with JOIN and ORDER BY LIMIT : make the optimizer consider a join order that allows to short-cut execution after producing #LIMIT matches if that promises N times speedup. (A conservative setting here would be is a high value, like 100 so the short-cutting plan is used if it promises a speedup of 100x or more). Short-cutting plans are inherently risky so the default is 0 which means do not consider this optimization +NUMERIC_MIN_VALUE 0 +NUMERIC_MAX_VALUE 4294967295 +NUMERIC_BLOCK_SIZE 1 +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME OPTIMIZER_MAX_SEL_ARGS VARIABLE_SCOPE SESSION VARIABLE_TYPE BIGINT UNSIGNED diff --git a/sql/sql_class.h b/sql/sql_class.h index a347700f72f..615bf97a0ef 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -760,6 +760,7 @@ typedef struct system_variables ulong net_retry_count; ulong net_wait_timeout; ulong net_write_timeout; + ulonglong optimizer_join_limit_pref_ratio; ulong optimizer_prune_level; ulong optimizer_search_depth; ulong optimizer_selectivity_sampling_limit; diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 1b8552a805e..3be4adb092f 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -233,12 +233,14 @@ static COND *make_cond_for_table_from_pred(THD *thd, Item *root_cond, static Item* part_of_refkey(TABLE *form,Field *field); uint find_shortest_key(TABLE *table, const key_map *usable_keys); -static bool test_if_cheaper_ordering(const JOIN_TAB *tab, +static bool test_if_cheaper_ordering(bool in_join_optimizer, + const JOIN_TAB *tab, ORDER *order, TABLE *table, key_map usable_keys, int key, ha_rows select_limit, int *new_key, int *new_key_direction, ha_rows *new_select_limit, + double *new_read_time, uint *new_used_key_parts= NULL, uint *saved_best_key_parts= NULL); static int test_if_order_by_key(JOIN *join, @@ -330,6 +332,10 @@ static void optimize_rownum(THD *thd, SELECT_LEX_UNIT *unit, Item *cond); static bool process_direct_rownum_comparison(THD *thd, SELECT_LEX_UNIT *unit, Item *cond); +static +bool join_limit_shortcut_is_applicable(const JOIN *join); +POSITION *join_limit_shortcut_finalize_plan(JOIN *join, double *cost); + static bool find_indexes_matching_order(JOIN *join, TABLE *table, ORDER *order, key_map *usable_keys); @@ -5817,6 +5823,7 @@ make_join_statistics(JOIN *join, List &tables_list, join->sort_by_table= get_sort_by_table(join->order, join->group_list, join->select_lex->leaf_tables, join->const_table_map); + join->limit_shortcut_applicable= join_limit_shortcut_is_applicable(join); /* Update info on indexes that can be used for search lookups as reading const tables may has added new sargable predicates. @@ -9175,6 +9182,7 @@ choose_plan(JOIN *join, table_map join_tables) THD *thd= join->thd; DBUG_ENTER("choose_plan"); + join->limit_optimization_mode= false; join->cur_embedding_map= 0; reset_nj_counters(join, join->join_list); qsort2_cmp jtab_sort_func; @@ -9232,9 +9240,45 @@ choose_plan(JOIN *join, table_map join_tables) if (search_depth == 0) /* Automatically determine a reasonable value for 'search_depth' */ search_depth= determine_search_depth(join); + + double limit_cost= DBL_MAX; + POSITION *limit_plan= NULL; + + /* First, build a join plan that can short-cut ORDER BY...LIMIT */ + if (join->limit_shortcut_applicable && !join->emb_sjm_nest) + { + bool res; + Json_writer_object wrapper(join->thd); + Json_writer_array trace(join->thd, "join_limit_shortcut_plan_search"); + join->limit_optimization_mode= true; + res= greedy_search(join, join_tables, search_depth, prune_level, + use_cond_selectivity); + join->limit_optimization_mode= false; + + if (res) + DBUG_RETURN(TRUE); + DBUG_ASSERT(join->best_read != DBL_MAX); + + /* + We've built a join order. Adjust its cost based on ORDER BY...LIMIT + short-cutting. + */ + limit_plan= join_limit_shortcut_finalize_plan(join, &limit_cost); + } + + /* The main call to search for the query plan: */ if (greedy_search(join, join_tables, search_depth, prune_level, use_cond_selectivity)) DBUG_RETURN(TRUE); + + DBUG_ASSERT(join->best_read != DBL_MAX); + if (limit_plan && limit_cost < join->best_read) + { + /* Plan that uses ORDER BY ... LIMIT shortcutting is better. */ + memcpy((uchar*)join->best_positions, (uchar*)limit_plan, + sizeof(POSITION)*join->table_count); + join->best_read= limit_cost; + } } /* @@ -10344,6 +10388,311 @@ check_if_edge_table(POSITION *pos, } +/* + @brief + Check if it is potentally possible to short-cut the JOIN execution due to + ORDER BY ... LIMIT clause + + @detail + It is possible when the join has "ORDER BY ... LIMIT n" clause, and the + sort+limit operation is done right after the join operation (there's no + grouping or DISTINCT in between). + Then we can potentially build a join plan that enumerates rows in the + ORDER BY order and so will be able to terminate as soon as it has produced + #limit rows. + + Note that it is not a requirement that sort_by_table has an index that + matches ORDER BY. If it doesn't have one, the optimizer will pass + sort_by_table to filesort. Reading from sort_by_table won't use + short-cutting but the rest of the join will. +*/ + +static +bool join_limit_shortcut_is_applicable(const JOIN *join) +{ + /* + Any post-join operation like GROUP BY or DISTINCT or window functions + means we cannot short-cut join execution + */ + if (!join->thd->variables.optimizer_join_limit_pref_ratio || + !join->order || + join->select_limit == HA_POS_ERROR || + join->group_list || + join->select_distinct || + join->select_options & SELECT_BIG_RESULT || + join->rollup.state != ROLLUP::STATE_NONE || + join->select_lex->have_window_funcs() || + join->select_lex->with_sum_func) + { + return false; + } + + /* + Cannot do short-cutting if + (1) ORDER BY refers to more than one table or + (2) the table it refers to cannot be first table in the join order + */ + if (!join->sort_by_table || // (1) + join->sort_by_table->reginfo.join_tab->dependent) // (2) + return false; + + Json_writer_object wrapper(join->thd); + Json_writer_object trace(join->thd, "join_limit_shortcut_is_applicable"); + trace.add("applicable", 1); + /* It looks like we can short-cut limit due to join */ + return true; +} + + +/* + @brief + Check if we could use an index-based access method to produce rows + in the order for ORDER BY ... LIMIT. + + @detail + This should do what test_if_skip_sort_order() does. We can't use that + function directly, because: + + 1. We're at the join optimization stage and have not done query plan + fix-ups done in get_best_combination() and co. + + 2. The code in test_if_skip_sort_order() does modify query plan structures, + for example it may change the table's quick select. This is done even if + it's called with no_changes=true parameter. + + @param access_method_changed OUT Whether the function changed the access + method to get rows in desired order. + @param new_access_cost OUT if access method changed: its cost. + + @return + true - Can skip sorting + false - Cannot skip sorting +*/ + +bool test_if_skip_sort_order_early(JOIN *join, + bool *access_method_changed, + double *new_access_cost) +{ + const POSITION *pos= &join->best_positions[join->const_tables]; + TABLE *table= pos->table->table; + key_map usable_keys= table->keys_in_use_for_order_by; + + *access_method_changed= false; + + // Step #1: Find indexes that produce the required ordering. + if (find_indexes_matching_order(join, table, join->order, &usable_keys)) + { + return false; // Cannot skip sorting + } + + // Step #2: Check if the index we're using produces the needed ordering + uint ref_key; + if (pos->key) + { + // Mirror the (wrong) logic in test_if_skip_sort_order: + if (pos->spl_plan || pos->type == JT_REF_OR_NULL) + return false; // Use filesort + + ref_key= pos->key->key; + } + else + { + if (pos->table->quick) + { + if (pos->table->quick->get_type() == QUICK_SELECT_I::QS_TYPE_RANGE) + ref_key= pos->table->quick->index; + else + ref_key= MAX_KEY; + } + else + ref_key= MAX_KEY; + } + + if (ref_key != MAX_KEY && usable_keys.is_set(ref_key)) + { + return true; // we're using an index that produces the reqired ordering. + } + + /* + Step #3: check if we can switch to using an index that would produce the + ordering. + (But don't actually switch, this will be done by test_if_skip_sort_order) + */ + int best_key= -1; + uint UNINIT_VAR(best_key_parts); + uint saved_best_key_parts= 0; + int best_key_direction= 0; + JOIN_TAB *tab= pos->table; + ha_rows new_limit; + double new_read_time; + if (test_if_cheaper_ordering(/*in_join_optimizer */TRUE, + tab, join->order, table, usable_keys, + ref_key, join->select_limit, + &best_key, &best_key_direction, + &new_limit, &new_read_time, + &best_key_parts, + &saved_best_key_parts)) + { + // Ok found a way to skip sorting + *access_method_changed= true; + *new_access_cost= new_read_time; + return true; + } + + return false; +} + + +/* + Compute the cost of join assuming we only need fraction of the output. +*/ + +double recompute_join_cost_with_limit(const JOIN *join, bool skip_sorting, + double *first_table_cost, + double fraction) +{ + POSITION *pos= join->best_positions + join->const_tables; + /* + Generally, we assume that producing X% of output takes X% of the cost. + */ + double partial_join_cost= join->best_read * fraction; + + if (skip_sorting) + { + /* + First table produces rows in required order. Two options: + + A. first_table_cost=NULL means we use whatever access method the join + optimizer has picked. Its cost was included in join->best_read and + we've already took a fraction of it. + + B. first_table_cost!=NULL means we will need to switch to another access + method, we have the cost to read rows to produce #LIMIT rows in join + output. + */ + if (first_table_cost) + { + /* + Subtract the remainder of the first table's cost we had in + join->best_read: + */ + partial_join_cost -= pos->read_time*fraction; + partial_join_cost -= pos->records_read*fraction / TIME_FOR_COMPARE; + + /* Add the cost of the new access method we've got: */ + partial_join_cost= COST_ADD(partial_join_cost, *first_table_cost); + } + } + else + { + DBUG_ASSERT(!first_table_cost); + /* + Cannot skip sorting. We read the first table entirely, then sort it. + + partial_join_cost includes pos->read_time*fraction. Add to it + pos->read_time*(1-fraction) so we have the cost to read the entire first + table. Do the same for costs of checking the WHERE. + */ + double extra_first_table_cost= pos->read_time * (1.0 - fraction); + double extra_first_table_where= pos->records_read * (1.0 - fraction) / + TIME_FOR_COMPARE; + + partial_join_cost= COST_ADD(partial_join_cost, + COST_ADD(extra_first_table_cost, + extra_first_table_where)); + } + return partial_join_cost; +} + + +/* + @brief + Finalize building the join order which allows to short-cut the join + execution. + + @detail + This is called after we have produced a join order that allows short- + cutting. + Here, we decide if it is cheaper to use this one or the original join + order. +*/ + +POSITION *join_limit_shortcut_finalize_plan(JOIN *join, double *cost) +{ + Json_writer_object wrapper(join->thd); + Json_writer_object trace(join->thd, "join_limit_shortcut_choice"); + + double fraction= join->select_limit / join->join_record_count; + trace.add("limit_fraction", fraction); + + /* Check which fraction of join output we need */ + if (fraction >= 1.0) + { + trace.add("skip_adjustment", "no short-cutting"); + return NULL; + } + + /* + Check if the first table's access method produces the required ordering. + Possible options: + 1. Yes: we can just take a fraction of the execution cost. + 2A No: change the access method to one that does produce the required + ordering, update the costs. + 2B No: Need to pass the first table to filesort(). + */ + bool skip_sorting; + bool access_method_changed; + double new_access_cost; + { + Json_writer_array tmp(join->thd, "test_if_skip_sort_order_early"); + skip_sorting= test_if_skip_sort_order_early(join, + &access_method_changed, + &new_access_cost); + } + trace.add("can_skip_filesort", skip_sorting); + + double cost_with_shortcut= + recompute_join_cost_with_limit(join, skip_sorting, + access_method_changed ? + &new_access_cost : (double*)0, + fraction); + double risk_ratio= + (double)join->thd->variables.optimizer_join_limit_pref_ratio; + trace.add("full_join_cost", join->best_read); + trace.add("risk_ratio", risk_ratio); + trace.add("shortcut_join_cost", cost_with_shortcut); + cost_with_shortcut *= risk_ratio; + trace.add("shortcut_cost_with_risk", cost_with_shortcut); + if (cost_with_shortcut < join->best_read) + { + trace.add("use_shortcut_cost", true); + POSITION *pos= (POSITION*)memdup_root(join->thd->mem_root, + join->best_positions, + sizeof(POSITION)* + (join->table_count + 1)); + *cost= cost_with_shortcut; + return pos; + } + trace.add("use_shortcut_cost", false); + return NULL; +} + + +/* + @brief + If we're in Limit Optimization Mode, allow only join->sort_by_table as + the first table in the join order +*/ + +static +bool join_limit_shortcut_allows_table(const JOIN *join, uint idx, JOIN_TAB *s) +{ + if (join->limit_optimization_mode && idx == join->const_tables) + return (join->sort_by_table == s->table); + return true; +} + + /** Find a good, possibly optimal, query execution plan (QEP) by a possibly exhaustive search. @@ -10520,7 +10869,8 @@ best_extension_by_limited_search(JOIN *join, if ((allowed_tables & real_table_bit) && !(remaining_tables & s->dependent) && - !check_interleaving_with_nj(s)) + !check_interleaving_with_nj(s) && + join_limit_shortcut_allows_table(join, idx, s)) { double current_record_count, current_read_time; double partial_join_cardinality; @@ -24790,7 +25140,8 @@ static void prepare_for_reverse_ordered_access(JOIN_TAB *tab) @brief Given a table and order, find indexes that produce rows in the order - @param usable_keys OUT Bitmap of indexes that produce rows in order. + @param usable_keys IN Bitmap of keys we can use + OUT Bitmap of indexes that produce rows in order. @return false Some indexes were found @@ -25089,11 +25440,13 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, int best_key_direction= 0; JOIN *join= tab->join; ha_rows table_records= table->stat_records(); + double new_read_time_dummy; - test_if_cheaper_ordering(tab, order, table, usable_keys, + test_if_cheaper_ordering(FALSE, tab, order, table, usable_keys, ref_key, select_limit, &best_key, &best_key_direction, - &select_limit, &best_key_parts, + &select_limit, &new_read_time_dummy, + &best_key_parts, &saved_best_key_parts); /* @@ -30324,11 +30677,13 @@ static bool get_range_limit_read_cost(const JOIN_TAB *tab, */ static bool -test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table, +test_if_cheaper_ordering(bool in_join_optimizer, + const JOIN_TAB *tab, ORDER *order, TABLE *table, key_map usable_keys, int ref_key, ha_rows select_limit_arg, int *new_key, int *new_key_direction, - ha_rows *new_select_limit, uint *new_used_key_parts, + ha_rows *new_select_limit, double *new_read_time, + uint *new_used_key_parts, uint *saved_best_key_parts) { DBUG_ENTER("test_if_cheaper_ordering"); @@ -30386,7 +30741,7 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table, if (join) { - uint tablenr= (uint)(tab - join->join_tab); + uint tablenr= join->const_tables; read_time= join->best_positions[tablenr].read_time; records= join->best_positions[tablenr].records_read; for (uint i= tablenr+1; i < join->table_count; i++) @@ -30424,12 +30779,27 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table, Calculate the selectivity of the ref_key for REF_ACCESS. For RANGE_ACCESS we use table->opt_range_condition_rows. */ - if (ref_key >= 0 && ref_key != MAX_KEY && tab->type == JT_REF) + if (in_join_optimizer) + { + if (ref_key >= 0 && ref_key != MAX_KEY && + join->best_positions[join->const_tables].type == JT_REF) + { + refkey_rows_estimate= + (ha_rows)join->best_positions[join->const_tables].records_read; + set_if_bigger(refkey_rows_estimate, 1); + } + } + else if (ref_key >= 0 && ref_key != MAX_KEY && tab->type == JT_REF) { /* If ref access uses keypart=const for all its key parts, and quick select uses the same # of key parts, then they are equivalent. Reuse #rows estimate from quick select as it is more precise. + + Note: we could just have used + join->best_positions[join->const_tables].records_read + here. That number was computed in best_access_path() and it already + includes adjustments based on table->opt_range[ref_key].rows. */ if (tab->ref.const_ref_part_map == make_prev_keypart_map(tab->ref.key_parts) && @@ -30755,6 +31125,7 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table, *new_key= best_key; *new_key_direction= best_key_direction; *new_select_limit= has_limit ? best_select_limit : table_records; + *new_read_time= read_time; if (new_used_key_parts != NULL) *new_used_key_parts= best_key_parts; DBUG_RETURN(TRUE); @@ -30854,10 +31225,11 @@ uint get_index_for_order(ORDER *order, TABLE *table, SQL_SELECT *select, table->opt_range_condition_rows= table->stat_records(); int key, direction; - if (test_if_cheaper_ordering(NULL, order, table, + double new_cost; + if (test_if_cheaper_ordering(FALSE, NULL, order, table, table->keys_in_use_for_order_by, -1, limit, - &key, &direction, &limit) && + &key, &direction, &limit, &new_cost) && !is_key_used(table, key, table->write_set)) { *need_sort= FALSE; diff --git a/sql/sql_select.h b/sql/sql_select.h index 128426993ac..59c4ebfc14e 100644 --- a/sql/sql_select.h +++ b/sql/sql_select.h @@ -1205,6 +1205,20 @@ public: passing 1st non-const table to filesort(). NULL means no such table exists. */ TABLE *sort_by_table; + + /* + If true, there is ORDER BY x LIMIT n clause and for certain join orders, it + is possible to short-cut the join execution, i.e. stop it as soon as n + output rows were produced. See join_limit_shortcut_is_applicable(). + */ + bool limit_shortcut_applicable; + + /* + Used during join optimization: if true, we're building a join order that + will short-cut join execution as soon as #LIMIT rows are produced. + */ + bool limit_optimization_mode; + /* Number of tables in the join. (In MySQL, it is named 'tables' and is also the number of elements in diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc index 42ec52b38a9..c5b507b936d 100644 --- a/sql/sys_vars.cc +++ b/sql/sys_vars.cc @@ -2702,6 +2702,20 @@ static Sys_var_ulong Sys_optimizer_selectivity_sampling_limit( VALID_RANGE(SELECTIVITY_SAMPLING_THRESHOLD, UINT_MAX), DEFAULT(SELECTIVITY_SAMPLING_LIMIT), BLOCK_SIZE(1)); +static Sys_var_ulonglong Sys_optimizer_join_limit_pref_ratio( + "optimizer_join_limit_pref_ratio", + "For queries with JOIN and ORDER BY LIMIT : make the optimizer " + "consider a join order that allows to short-cut execution after " + "producing #LIMIT matches if that promises N times speedup. " + "(A conservative setting here would be is a high value, like 100 so " + "the short-cutting plan is used if it promises a speedup of 100x or " + "more). Short-cutting plans are inherently risky so the default is 0 " + "which means do not consider this optimization", + SESSION_VAR(optimizer_join_limit_pref_ratio), + CMD_LINE(REQUIRED_ARG), + VALID_RANGE(0, UINT_MAX), + DEFAULT(0), BLOCK_SIZE(1)); + static Sys_var_ulong Sys_optimizer_use_condition_selectivity( "optimizer_use_condition_selectivity", "Controls selectivity of which conditions the optimizer takes into " From b3cc9529160f8fcdda5ad510f5a4d083f057deab Mon Sep 17 00:00:00 2001 From: Julius Goryavsky Date: Tue, 3 Sep 2024 05:57:22 +0200 Subject: [PATCH 19/21] galera tests: updated .result for galera_gtid_2_cluster test --- mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result b/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result index 1cb14cd3eff..b849cc9f368 100644 --- a/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result +++ b/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result @@ -223,10 +223,14 @@ select @@gtid_binlog_state; drop table t1; stop slave; reset slave; +Warnings: +Note 4190 RESET SLAVE is implicitly changing the value of 'Using_Gtid' from 'Current_Pos' to 'Slave_Pos' cluster 2 node 1 connection node_4; stop slave; reset slave; +Warnings: +Note 4190 RESET SLAVE is implicitly changing the value of 'Using_Gtid' from 'Current_Pos' to 'Slave_Pos' cluster 1 node 1 connection node_1; change master to master_use_gtid=no, ignore_server_ids=(); From fe5829a1214bcb2e4323d3fb826bf607e1059ef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 4 Sep 2024 14:24:30 +0300 Subject: [PATCH 20/21] MDEV-34446 SIGSEGV on SET GLOBAL innodb_log_file_size with memory-mapped log file log_t::resize_write(): Advance log_sys.resize_lsn and reset the resize_log offset to START_OFFSET whenever the memory-mapped buffer would wrap around. Previously, in case the initial target offset would be beyond the requested innodb_log_file_size, we only adjusted the offset but not the LSN. An incorrect LSN would cause log_sys.buf_free to be out of bounds when the log resizing completes. The log_sys.lsn_lock will cover the entire duration of replicating memory-mapped log for resizing. We just need a mutex that is compatible with the caller holding log_sys.latch. While the choice of mtr_t::finisher (for normal log writes) depends on mtr_t::spin_wait_delay, replicating the log during resizing is a rare operation where we can afford possible additional context switching overhead. --- storage/innobase/mtr/mtr0mtr.cc | 112 ++++++++++++++------------------ 1 file changed, 49 insertions(+), 63 deletions(-) diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index fe9a882a36c..d531838b5fd 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -1217,19 +1217,43 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, if (!resize_flush_buf) { ut_ad(is_pmem()); + lsn_lock.wr_lock(); const size_t resize_capacity{resize_target - START_OFFSET}; - const lsn_t resizing{resize_in_progress()}; - if (UNIV_UNLIKELY(lsn < resizing)) { - size_t l= resizing - lsn; - if (l >= len) - return; - end+= l - len; - len-= l; - lsn+= l; + const lsn_t resizing{resize_in_progress()}; + /* For memory-mapped log, log_t::resize_start() would never + set log_sys.resize_lsn to less than log_sys.lsn. It cannot + execute concurrently with this thread, because we are holding + log_sys.latch and it would hold an exclusive log_sys.latch. */ + if (UNIV_UNLIKELY(lsn < resizing)) + { + /* This function may execute in multiple concurrent threads + that hold a shared log_sys.latch. Before we got lsn_lock, + another thread could have executed resize_lsn.store(lsn) below + with a larger lsn than ours. + + append_prepare() guarantees that the concurrent writes + cannot overlap, that is, our entire log must be discarded. + Besides, incomplete mini-transactions cannot be parsed anyway. */ + ut_ad(resizing >= lsn + len); + goto pmem_done; + } + + s= START_OFFSET; + + if (UNIV_UNLIKELY(lsn - resizing + len >= resize_capacity)) + { + resize_lsn.store(lsn, std::memory_order_relaxed); + lsn= 0; + } + else + { + lsn-= resizing; + s+= lsn; + } } - lsn-= resizing; - s= START_OFFSET + lsn % resize_capacity; + + ut_ad(s + len <= resize_target); if (UNIV_UNLIKELY(end < &buf[START_OFFSET])) { @@ -1239,59 +1263,22 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, ut_ad(end + capacity() + len >= &buf[file_size]); size_t l= size_t(buf - (end - START_OFFSET)); - if (UNIV_LIKELY(s + len <= resize_target)) - { - /* The destination buffer (log_sys.resize_buf) did not wrap around */ - memcpy(resize_buf + s, end + capacity(), l); - memcpy(resize_buf + s + l, &buf[START_OFFSET], len - l); - goto pmem_nowrap; - } - else - { - /* Both log_sys.buf and log_sys.resize_buf wrapped around */ - const size_t rl= resize_target - s; - if (l <= rl) - { - /* log_sys.buf wraps around first */ - memcpy(resize_buf + s, end + capacity(), l); - memcpy(resize_buf + s + l, &buf[START_OFFSET], rl - l); - memcpy(resize_buf + START_OFFSET, &buf[START_OFFSET + rl - l], - len - l); - } - else - { - /* log_sys.resize_buf wraps around first */ - memcpy(resize_buf + s, end + capacity(), rl); - memcpy(resize_buf + START_OFFSET, end + capacity() + rl, l - rl); - memcpy(resize_buf + START_OFFSET + (l - rl), - &buf[START_OFFSET], len - l); - } - goto pmem_wrap; - } + memcpy(resize_buf + s, end + capacity(), l); + memcpy(resize_buf + s + l, &buf[START_OFFSET], len - l); } else { ut_ad(end + len <= &buf[file_size]); - - if (UNIV_LIKELY(s + len <= resize_target)) - { - memcpy(resize_buf + s, end, len); - pmem_nowrap: - s+= len - seq; - } - else - { - /* The log_sys.resize_buf wrapped around */ - memcpy(resize_buf + s, end, resize_target - s); - memcpy(resize_buf + START_OFFSET, end + (resize_target - s), - len - (resize_target - s)); - pmem_wrap: - s+= len - seq; - if (s >= resize_target) - s-= resize_capacity; - resize_lsn.fetch_add(resize_capacity); /* Move the target ahead. */ - } + memcpy(resize_buf + s, end, len); } + s+= len - seq; + + /* Always set the sequence bit. If the resized log were to wrap around, + we will advance resize_lsn. */ + ut_ad(resize_buf[s] <= 1); + resize_buf[s]= 1; + pmem_done: + lsn_lock.wr_unlock(); } else #endif @@ -1301,12 +1288,11 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, ut_ad(s + len <= buf_size); memcpy(resize_buf + s, end, len); s+= len - seq; + /* Always set the sequence bit. If the resized log were to wrap around, + we will advance resize_lsn. */ + ut_ad(resize_buf[s] <= 1); + resize_buf[s]= 1; } - - /* Always set the sequence bit. If the resized log were to wrap around, - we will advance resize_lsn. */ - ut_ad(resize_buf[s] <= 1); - resize_buf[s]= 1; } } From 4972f9fc0f8bc0d8e85be1291b2bcbc64d3dd6f1 Mon Sep 17 00:00:00 2001 From: Thirunarayanan Balathandayuthapani Date: Thu, 5 Sep 2024 16:24:16 +0530 Subject: [PATCH 21/21] MDEV-33087 ALTER TABLE...ALGORITHM=COPY should build indexes more efficiently - Remove the usage of alter_algorithm variable and disable the persistent statistics in alter_copy_bulk test case. --- .../suite/innodb/r/alter_copy_bulk,OFF.rdiff | 6 ++--- .../suite/innodb/r/alter_copy_bulk.result | 22 ++++++++++--------- .../suite/innodb/t/alter_copy_bulk.test | 22 ++++++++++--------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/mysql-test/suite/innodb/r/alter_copy_bulk,OFF.rdiff b/mysql-test/suite/innodb/r/alter_copy_bulk,OFF.rdiff index a644139ea78..6570c52def1 100644 --- a/mysql-test/suite/innodb/r/alter_copy_bulk,OFF.rdiff +++ b/mysql-test/suite/innodb/r/alter_copy_bulk,OFF.rdiff @@ -1,9 +1,9 @@ --- bulk_copy_alter.result +++ bulk_copy_alter,non_bulk_alter_copy.result -@@ -5,7 +5,7 @@ +@@ -6,7 +6,7 @@ INSERT INTO t1 SELECT repeat('b', 200), seq FROM seq_3_to_65536; - ALTER TABLE t1 ADD INDEX(f2); - ALTER TABLE t1 ADD PRIMARY KEY(f1(2)); + ALTER TABLE t1 ALGORITHM=COPY, ADD INDEX(f2); + ALTER TABLE t1 ALGORITHM=COPY, ADD PRIMARY KEY(f1(2)); -ERROR 23000: Duplicate entry 'bb' for key 'PRIMARY' +ERROR 23000: Duplicate entry 'aa' for key 'PRIMARY' INSERT INTO t1 VALUES(repeat('a', 200), 1); diff --git a/mysql-test/suite/innodb/r/alter_copy_bulk.result b/mysql-test/suite/innodb/r/alter_copy_bulk.result index 53c7e8a812d..cd83e65a5f4 100644 --- a/mysql-test/suite/innodb/r/alter_copy_bulk.result +++ b/mysql-test/suite/innodb/r/alter_copy_bulk.result @@ -1,10 +1,11 @@ -SET @@alter_algorithm=COPY; +SET @default_stats_persistent= @@global.innodb_stats_persistent; +SET GLOBAL innodb_stats_persistent= 0; CREATE TABLE t1(f1 CHAR(200), f2 INT NOT NULL)engine=InnoDB; INSERT INTO t1 SELECT repeat('a', 200), seq FROM seq_1_to_2; -ALTER TABLE t1 FORCE; +ALTER TABLE t1 ALGORITHM=COPY, FORCE; INSERT INTO t1 SELECT repeat('b', 200), seq FROM seq_3_to_65536; -ALTER TABLE t1 ADD INDEX(f2); -ALTER TABLE t1 ADD PRIMARY KEY(f1(2)); +ALTER TABLE t1 ALGORITHM=COPY, ADD INDEX(f2); +ALTER TABLE t1 ALGORITHM=COPY, ADD PRIMARY KEY(f1(2)); ERROR 23000: Duplicate entry 'bb' for key 'PRIMARY' INSERT INTO t1 VALUES(repeat('a', 200), 1); ALTER TABLE t1 ADD UNIQUE KEY(f2); @@ -13,14 +14,14 @@ ALTER IGNORE TABLE t1 MODIFY f1 CHAR(200) NOT NULL; CREATE TABLE t2(f1 INT NOT NULL, FOREIGN KEY(f1) REFERENCES t1(f2))ENGINE=InnoDB; INSERT INTO t2 VALUES(1); -ALTER TABLE t2 FORCE; +ALTER TABLE t2 ALGORITHM=COPY, FORCE; DROP TABLE t2, t1; CREATE TABLE t1 (f1 INT, f2 INT) ENGINE=InnoDB PARTITION BY HASH(f1) PARTITIONS 2; INSERT INTO t1 VALUES(1, 1); INSERT INTO t1 SELECT seq, seq * 2 FROM seq_1_to_2; -ALTER TABLE t1 FORCE; +ALTER TABLE t1 ALGORITHM=COPY, FORCE; INSERT INTO t1 SELECT seq, seq * 2 FROM seq_3_to_65536; -ALTER TABLE t1 ADD INDEX(f2); +ALTER TABLE t1 ALGORITHM=COPY, ADD INDEX(f2); DROP TABLE t1; # # MDEV-34756 Validation of new foreign key skipped @@ -39,13 +40,14 @@ ALTER TABLE t2 ADD CONSTRAINT FOREIGN KEY(f2) REFERENCES t1(f1); ERROR 23000: Cannot add or update a child row: a foreign key constraint fails (`test`.`#sql-alter`, CONSTRAINT `#sql-alter_ibfk_2` FOREIGN KEY (`f2`) REFERENCES `t1` (`f1`)) INSERT INTO t1 VALUES(3, 1); SET STATEMENT foreign_key_checks=0 FOR -ALTER TABLE t2 ADD CONSTRAINT FOREIGN KEY(f2) REFERENCES t1(f1); +ALTER TABLE t2 ALGORITHM=COPY, ADD CONSTRAINT FOREIGN KEY(f2) REFERENCES t1(f1); affected rows: 1 info: Records: 1 Duplicates: 0 Warnings: 0 -ALTER TABLE t1 FORCE; +ALTER TABLE t1 ALGORITHM=COPY, FORCE; affected rows: 2 info: Records: 2 Duplicates: 0 Warnings: 0 -ALTER TABLE t2 FORCE; +ALTER TABLE t2 ALGORITHM=COPY, FORCE; affected rows: 1 info: Records: 1 Duplicates: 0 Warnings: 0 DROP TABLE t2, t1; +SET GLOBAL innodb_stats_persistent=@default_stats_persistent; diff --git a/mysql-test/suite/innodb/t/alter_copy_bulk.test b/mysql-test/suite/innodb/t/alter_copy_bulk.test index ae815cd4e30..2e8764578d5 100644 --- a/mysql-test/suite/innodb/t/alter_copy_bulk.test +++ b/mysql-test/suite/innodb/t/alter_copy_bulk.test @@ -1,21 +1,22 @@ --source include/have_innodb.inc --source include/have_partition.inc --source include/have_sequence.inc -SET @@alter_algorithm=COPY; +SET @default_stats_persistent= @@global.innodb_stats_persistent; +SET GLOBAL innodb_stats_persistent= 0; CREATE TABLE t1(f1 CHAR(200), f2 INT NOT NULL)engine=InnoDB; INSERT INTO t1 SELECT repeat('a', 200), seq FROM seq_1_to_2; # Buffer fits in the memory -ALTER TABLE t1 FORCE; +ALTER TABLE t1 ALGORITHM=COPY, FORCE; # Insert more entries INSERT INTO t1 SELECT repeat('b', 200), seq FROM seq_3_to_65536; # Alter should use temporary file for sorting -ALTER TABLE t1 ADD INDEX(f2); +ALTER TABLE t1 ALGORITHM=COPY, ADD INDEX(f2); # Error while buffering the insert operation --error ER_DUP_ENTRY -ALTER TABLE t1 ADD PRIMARY KEY(f1(2)); +ALTER TABLE t1 ALGORITHM=COPY, ADD PRIMARY KEY(f1(2)); INSERT INTO t1 VALUES(repeat('a', 200), 1); # Error while applying the bulk insert operation @@ -29,18 +30,18 @@ CREATE TABLE t2(f1 INT NOT NULL, FOREIGN KEY(f1) REFERENCES t1(f2))ENGINE=InnoDB; INSERT INTO t2 VALUES(1); # Bulk operation shouldn't happen because of foreign key constraints -ALTER TABLE t2 FORCE; +ALTER TABLE t2 ALGORITHM=COPY, FORCE; DROP TABLE t2, t1; CREATE TABLE t1 (f1 INT, f2 INT) ENGINE=InnoDB PARTITION BY HASH(f1) PARTITIONS 2; INSERT INTO t1 VALUES(1, 1); INSERT INTO t1 SELECT seq, seq * 2 FROM seq_1_to_2; # Buffer fits in the memory -ALTER TABLE t1 FORCE; +ALTER TABLE t1 ALGORITHM=COPY, FORCE; # Insert more entries INSERT INTO t1 SELECT seq, seq * 2 FROM seq_3_to_65536; # Alter should use temporary file for sorting -ALTER TABLE t1 ADD INDEX(f2); +ALTER TABLE t1 ALGORITHM=COPY, ADD INDEX(f2); DROP TABLE t1; --echo # @@ -62,8 +63,9 @@ ALTER TABLE t2 ADD CONSTRAINT FOREIGN KEY(f2) REFERENCES t1(f1); INSERT INTO t1 VALUES(3, 1); --enable_info SET STATEMENT foreign_key_checks=0 FOR -ALTER TABLE t2 ADD CONSTRAINT FOREIGN KEY(f2) REFERENCES t1(f1); -ALTER TABLE t1 FORCE; -ALTER TABLE t2 FORCE; +ALTER TABLE t2 ALGORITHM=COPY, ADD CONSTRAINT FOREIGN KEY(f2) REFERENCES t1(f1); +ALTER TABLE t1 ALGORITHM=COPY, FORCE; +ALTER TABLE t2 ALGORITHM=COPY, FORCE; --disable_info DROP TABLE t2, t1; +SET GLOBAL innodb_stats_persistent=@default_stats_persistent;