From 7fb17e42cf2f6f309f43907f2db84389d8d895e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 10 Jan 2011 15:34:45 +0200 Subject: [PATCH 01/15] Bug#59181 InnoDB compilation failure on the Sun Studio compiler Define UNIV_PREFETCH_R(addr) as sun_prefetch_read_many((void*) addr), because apparently some versions of the Sun library omit the const qualifier. --- storage/innodb_plugin/include/univ.i | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/innodb_plugin/include/univ.i b/storage/innodb_plugin/include/univ.i index bbff8ddf1e3..4425950748b 100644 --- a/storage/innodb_plugin/include/univ.i +++ b/storage/innodb_plugin/include/univ.i @@ -412,7 +412,7 @@ it is read or written. */ /* Use sun_prefetch when compile with Sun Studio */ # define UNIV_EXPECT(expr,value) (expr) # define UNIV_LIKELY_NULL(expr) (expr) -# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many(addr) +# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr) # define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr) #else /* Dummy versions of the macros */ From 634fe860562f7249c55a0f946a1fb50972b9f9ff Mon Sep 17 00:00:00 2001 From: Vasil Dimov Date: Wed, 12 Jan 2011 17:53:05 +0200 Subject: [PATCH 02/15] Suppress InnoDB warning about long semaphore wait if running under Valgrind Sometimes Valgrind can be extremely slow and trigger the InnoDB diagnostic message, causing the test to fail. 
--- mysql-test/suite/innodb/t/innodb_bug56143.test | 5 +++++ mysql-test/suite/innodb_plugin/t/innodb_bug56143.test | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/mysql-test/suite/innodb/t/innodb_bug56143.test b/mysql-test/suite/innodb/t/innodb_bug56143.test index 1218ae6621c..b69d0048ee8 100644 --- a/mysql-test/suite/innodb/t/innodb_bug56143.test +++ b/mysql-test/suite/innodb/t/innodb_bug56143.test @@ -8,6 +8,11 @@ -- disable_query_log -- disable_result_log +if ($VALGRIND_TEST) +{ + call mtr.add_suppression("InnoDB: Warning: a long semaphore wait:"); +} + SET foreign_key_checks=0; DROP TABLE IF EXISTS bug56143; CREATE TABLE `bug56143` ( diff --git a/mysql-test/suite/innodb_plugin/t/innodb_bug56143.test b/mysql-test/suite/innodb_plugin/t/innodb_bug56143.test index 7c7472303db..0f135a44f5d 100644 --- a/mysql-test/suite/innodb_plugin/t/innodb_bug56143.test +++ b/mysql-test/suite/innodb_plugin/t/innodb_bug56143.test @@ -8,6 +8,11 @@ -- disable_query_log -- disable_result_log +if ($VALGRIND_TEST) +{ + call mtr.add_suppression("InnoDB: Warning: a long semaphore wait:"); +} + SET foreign_key_checks=0; DROP TABLE IF EXISTS bug56143_1; From 9cd4d4984025857782e12e53d32cea5e4b7684e5 Mon Sep 17 00:00:00 2001 From: Jimmy Yang Date: Fri, 14 Jan 2011 09:02:28 -0800 Subject: [PATCH 03/15] Fix Bug#30423 "InnoDBs treatment of NULL in index stats causes bad "rows examined" estimates". This change implements "innodb_stats_method" with options of "nulls_equal", "nulls_unequal" and "nulls_ignored". 
rb://553 approved by Marko --- .../suite/innodb/r/innodb_bug30423.result | 95 ++++++++ .../suite/innodb/t/innodb_bug30423.test | 211 ++++++++++++++++++ .../innodb_plugin/r/innodb_bug30423.result | 95 ++++++++ .../innodb_plugin/t/innodb_bug30423.test | 211 ++++++++++++++++++ storage/innobase/btr/btr0cur.c | 146 +++++++++--- storage/innobase/dict/dict0dict.c | 10 + storage/innobase/handler/ha_innodb.cc | 95 +++++++- storage/innobase/include/btr0cur.h | 5 +- storage/innobase/include/dict0mem.h | 6 + storage/innobase/include/rem0cmp.h | 4 + storage/innobase/include/rem0cmp.ic | 2 +- storage/innobase/include/srv0srv.h | 18 ++ storage/innobase/rem/rem0cmp.c | 14 +- storage/innobase/srv/srv0srv.c | 5 + storage/innodb_plugin/ChangeLog | 8 + storage/innodb_plugin/btr/btr0cur.c | 150 ++++++++++--- storage/innodb_plugin/dict/dict0dict.c | 10 + storage/innodb_plugin/handler/ha_innodb.cc | 95 +++++++- storage/innodb_plugin/include/btr0cur.h | 5 +- storage/innodb_plugin/include/dict0mem.h | 6 + storage/innodb_plugin/include/rem0cmp.h | 4 + storage/innodb_plugin/include/rem0cmp.ic | 2 +- storage/innodb_plugin/include/srv0srv.h | 18 ++ storage/innodb_plugin/rem/rem0cmp.c | 14 +- storage/innodb_plugin/srv/srv0srv.c | 5 + 25 files changed, 1157 insertions(+), 77 deletions(-) create mode 100644 mysql-test/suite/innodb/r/innodb_bug30423.result create mode 100644 mysql-test/suite/innodb/t/innodb_bug30423.test create mode 100644 mysql-test/suite/innodb_plugin/r/innodb_bug30423.result create mode 100644 mysql-test/suite/innodb_plugin/t/innodb_bug30423.test diff --git a/mysql-test/suite/innodb/r/innodb_bug30423.result b/mysql-test/suite/innodb/r/innodb_bug30423.result new file mode 100644 index 00000000000..a19809366ae --- /dev/null +++ b/mysql-test/suite/innodb/r/innodb_bug30423.result @@ -0,0 +1,95 @@ +set global innodb_stats_method = default; +select @@innodb_stats_method; +@@innodb_stats_method +nulls_equal +select count(*) from bug30243_3 where org_id is not NULL; +count(*) +20 
+select count(*) from bug30243_3 where org_id is NULL; +count(*) +16384 +select count(*) from bug30243_2 where org_id is not NULL; +count(*) +224 +select count(*) from bug30243_2 where org_id is NULL; +count(*) +65536 +select @@innodb_stats_method; +@@innodb_stats_method +nulls_equal +analyze table bug30243_1; +Table Op Msg_type Msg_text +test.bug30243_1 analyze status OK +analyze table bug30243_2; +Table Op Msg_type Msg_text +test.bug30243_2 analyze status OK +analyze table bug30243_3; +Table Op Msg_type Msg_text +test.bug30243_3 analyze status OK +set global innodb_stats_method = "NULL"; +ERROR 42000: Variable 'stats_method' can't be set to the value of 'NULL' +set global innodb_stats_method = "nulls_ignored"; +select @@innodb_stats_method; +@@innodb_stats_method +nulls_ignored +analyze table bug30243_1; +Table Op Msg_type Msg_text +test.bug30243_1 analyze status OK +analyze table bug30243_2; +Table Op Msg_type Msg_text +test.bug30243_2 analyze status OK +analyze table bug30243_3; +Table Op Msg_type Msg_text +test.bug30243_3 analyze status OK +explain SELECT COUNT(*), 0 +FROM bug30243_1 orgs +LEFT JOIN bug30243_3 sa_opportunities +ON orgs.org_id=sa_opportunities.org_id +LEFT JOIN bug30243_2 contacts +ON orgs.org_id=contacts.org_id ; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE orgs index NULL org_id 4 NULL 128 Using index +1 SIMPLE sa_opportunities ref org_id org_id 5 test.orgs.org_id 1 Using index +1 SIMPLE contacts ref contacts$org_id contacts$org_id 5 test.orgs.org_id 1 Using index +select @@innodb_stats_method; +@@innodb_stats_method +nulls_ignored +set global innodb_stats_method = "nulls_unequal"; +select @@innodb_stats_method; +@@innodb_stats_method +nulls_unequal +analyze table bug30243_1; +Table Op Msg_type Msg_text +test.bug30243_1 analyze status OK +analyze table bug30243_2; +Table Op Msg_type Msg_text +test.bug30243_2 analyze status OK +analyze table bug30243_3; +Table Op Msg_type Msg_text +test.bug30243_3 analyze 
status OK +explain SELECT COUNT(*), 0 +FROM bug30243_1 orgs +LEFT JOIN bug30243_3 sa_opportunities +ON orgs.org_id=sa_opportunities.org_id +LEFT JOIN bug30243_2 contacts +ON orgs.org_id=contacts.org_id; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE orgs index NULL org_id 4 NULL 128 Using index +1 SIMPLE sa_opportunities ref org_id org_id 5 test.orgs.org_id 1 Using index +1 SIMPLE contacts ref contacts$org_id contacts$org_id 5 test.orgs.org_id 1 Using index +SELECT COUNT(*) FROM table_bug30423 WHERE org_id IS NULL; +COUNT(*) +1024 +set global innodb_stats_method = "nulls_unequal"; +analyze table table_bug30423; +Table Op Msg_type Msg_text +test.table_bug30423 analyze status OK +set global innodb_stats_method = "nulls_ignored"; +analyze table table_bug30423; +Table Op Msg_type Msg_text +test.table_bug30423 analyze status OK +set global innodb_stats_method = nulls_equal; +drop table bug30243_2; +drop table bug30243_1; +drop table bug30243_3; +drop table table_bug30423; diff --git a/mysql-test/suite/innodb/t/innodb_bug30423.test b/mysql-test/suite/innodb/t/innodb_bug30423.test new file mode 100644 index 00000000000..f2a3ee8d099 --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_bug30423.test @@ -0,0 +1,211 @@ +# Test for Bug #30423, InnoDBs treatment of NULL in index stats causes +# bad "rows examined" estimates. +# Implemented InnoDB system variable "innodb_stats_method" with +# "nulls_equal" (default), "nulls_unequal", and "nulls_ignored" options. + +-- source include/have_innodb.inc + +let $innodb_stats_method_orig = `select @@innodb_stats_method`; + +# default setting for innodb_stats_method is "nulls_equal" +set global innodb_stats_method = default; + +select @@innodb_stats_method; + +# create three tables, bug30243_1, bug30243_2 and bug30243_3. +# The test scenario is adopted from original bug #30423 report. 
+# table bug30243_1 and bug30243_3 have many NULL values + +-- disable_result_log +-- disable_query_log + +DROP TABLE IF EXISTS bug30243_1; +CREATE TABLE bug30243_1 ( + org_id int(11) NOT NULL default '0', + UNIQUE KEY (org_id) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +LOCK TABLES bug30243_1 WRITE; +INSERT INTO bug30243_1 VALUES (11),(15),(16),(17),(19),(20),(21),(23),(24), +(25),(26),(27),(28),(29),(30),(31),(32),(33),(34),(35),(37),(38),(40),(41), +(42),(43),(44),(45),(46),(47),(48),(49),(50),(51),(52),(53),(54),(55),(56), +(57),(58),(59),(60),(61),(62),(63),(64),(65),(66),(67),(68),(69),(70),(71), +(72),(73),(74),(75),(76),(77),(78),(79),(80),(81),(82),(83),(84),(85),(86), +(87),(88),(89),(90),(91),(92),(93),(94),(95),(96),(97),(98),(99),(100),(101), +(102),(103),(104),(105),(106),(107),(108),(109),(110),(111),(112),(113),(114), +(115),(116),(117),(118),(119),(120),(121),(122),(123),(124),(125),(126),(127), +(128),(129),(130),(131),(132),(133),(134),(135),(136),(137),(138),(139),(140), +(141),(142),(143),(144),(145); +UNLOCK TABLES; + +DROP TABLE IF EXISTS bug30243_3; +CREATE TABLE bug30243_3 ( + org_id int(11) default NULL, + KEY (org_id) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +INSERT INTO bug30243_3 VALUES (NULL); + +begin; +let $i=14; +while ($i) +{ + INSERT INTO bug30243_3 SELECT NULL FROM bug30243_3; + dec $i; +} + +INSERT INTO bug30243_3 VALUES (34),(34),(35),(56),(58),(62),(62),(64),(65),(66),(80),(135),(137),(138),(139),(140),(142),(143),(144),(145); +commit; + +DROP TABLE IF EXISTS bug30243_2; +CREATE TABLE bug30243_2 ( + org_id int(11) default NULL, + KEY `contacts$org_id` (org_id) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +INSERT INTO bug30243_2 VALUES (NULL); + +begin; +let $i=16; +while ($i) +{ + INSERT INTO bug30243_2 SELECT NULL FROM bug30243_2; + dec $i; +} + +INSERT INTO bug30243_2 VALUES (11),(15),(16),(17),(20),(21),(23),(24),(25), +(26),(27),(28),(29),(30),(31),(32),(33),(34),(37),(38),(40),(41),(42),(43), 
+(44),(45),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46), +(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46), +(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46), +(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46), +(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46), +(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(48), +(48),(50),(51),(52),(52),(53),(54),(55),(57),(60),(61),(62),(62),(62),(62), +(62),(63),(64),(64),(65),(66),(66),(67),(68),(69),(70),(71),(72),(73),(74), +(75),(76),(77),(78),(79),(80),(80),(81),(82),(83),(84),(85),(86),(87),(88), +(89),(90),(91),(92),(93),(94),(95),(96),(97),(98),(99),(100),(101),(102), +(103),(104),(105),(106),(107),(108),(109),(110),(111),(112),(113),(114), +(115),(116),(117),(118),(119),(120),(121),(122),(123),(124),(125),(126), +(127),(128),(129),(130),(131),(132),(133),(133),(135),(135),(135),(135), +(136),(136),(138),(138),(139),(139),(139),(140),(141),(141),(142),(143), +(143),(145),(145); +commit; + + +-- enable_result_log +-- enable_query_log + +# check tables's value +select count(*) from bug30243_3 where org_id is not NULL; +select count(*) from bug30243_3 where org_id is NULL; + +select count(*) from bug30243_2 where org_id is not NULL; +select count(*) from bug30243_2 where org_id is NULL; + +select @@innodb_stats_method; + +analyze table bug30243_1; +analyze table bug30243_2; +analyze table bug30243_3; + +# Following query plan shows that we over estimate the rows per +# unique value (since there are many NULLs). 
+# Skip this query log since the stats estimate could vary from runs +-- disable_query_log +-- disable_result_log +explain SELECT COUNT(*), 0 + FROM bug30243_1 orgs + LEFT JOIN bug30243_3 sa_opportunities + ON orgs.org_id=sa_opportunities.org_id + LEFT JOIN bug30243_2 contacts + ON orgs.org_id=contacts.org_id ; +-- enable_query_log +-- enable_result_log + +# following set operation will fail +#--error ER_WRONG_VALUE_FOR_VAR +--error 1231 +set global innodb_stats_method = "NULL"; + +set global innodb_stats_method = "nulls_ignored"; + +select @@innodb_stats_method; + +# Regenerate the stats with "nulls_ignored" option + +analyze table bug30243_1; +analyze table bug30243_2; +analyze table bug30243_3; + +# Following query plan shows that we get the correct rows per +# unique value (should be approximately 1 row per value) +explain SELECT COUNT(*), 0 + FROM bug30243_1 orgs + LEFT JOIN bug30243_3 sa_opportunities + ON orgs.org_id=sa_opportunities.org_id + LEFT JOIN bug30243_2 contacts + ON orgs.org_id=contacts.org_id ; + +select @@innodb_stats_method; + +# Try the "nulls_unequal" option +set global innodb_stats_method = "nulls_unequal"; + +select @@innodb_stats_method; + +analyze table bug30243_1; +analyze table bug30243_2; +analyze table bug30243_3; + +# Following query plan shows that we get the correct rows per +# unique value (~1) +explain SELECT COUNT(*), 0 + FROM bug30243_1 orgs + LEFT JOIN bug30243_3 sa_opportunities + ON orgs.org_id=sa_opportunities.org_id + LEFT JOIN bug30243_2 contacts + ON orgs.org_id=contacts.org_id; + + +# Create a table with all NULL values, make sure the stats calculation +# does not crash with table of all NULL values +-- disable_query_log +CREATE TABLE table_bug30423 ( + org_id int(11) default NULL, + KEY(org_id) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +INSERT INTO `table_bug30423` VALUES (NULL); + +begin; +let $i=10; +while ($i) +{ + INSERT INTO table_bug30423 SELECT NULL FROM table_bug30423; + dec $i; +} +commit; + +-- 
enable_query_log + +SELECT COUNT(*) FROM table_bug30423 WHERE org_id IS NULL; + +# calculate the statistics for the table for "nulls_ignored" and +# "nulls_unequal" option +set global innodb_stats_method = "nulls_unequal"; +analyze table table_bug30423; + +set global innodb_stats_method = "nulls_ignored"; +analyze table table_bug30423; + + +eval set global innodb_stats_method = $innodb_stats_method_orig; + +drop table bug30243_2; + +drop table bug30243_1; + +drop table bug30243_3; + +drop table table_bug30423; diff --git a/mysql-test/suite/innodb_plugin/r/innodb_bug30423.result b/mysql-test/suite/innodb_plugin/r/innodb_bug30423.result new file mode 100644 index 00000000000..a19809366ae --- /dev/null +++ b/mysql-test/suite/innodb_plugin/r/innodb_bug30423.result @@ -0,0 +1,95 @@ +set global innodb_stats_method = default; +select @@innodb_stats_method; +@@innodb_stats_method +nulls_equal +select count(*) from bug30243_3 where org_id is not NULL; +count(*) +20 +select count(*) from bug30243_3 where org_id is NULL; +count(*) +16384 +select count(*) from bug30243_2 where org_id is not NULL; +count(*) +224 +select count(*) from bug30243_2 where org_id is NULL; +count(*) +65536 +select @@innodb_stats_method; +@@innodb_stats_method +nulls_equal +analyze table bug30243_1; +Table Op Msg_type Msg_text +test.bug30243_1 analyze status OK +analyze table bug30243_2; +Table Op Msg_type Msg_text +test.bug30243_2 analyze status OK +analyze table bug30243_3; +Table Op Msg_type Msg_text +test.bug30243_3 analyze status OK +set global innodb_stats_method = "NULL"; +ERROR 42000: Variable 'stats_method' can't be set to the value of 'NULL' +set global innodb_stats_method = "nulls_ignored"; +select @@innodb_stats_method; +@@innodb_stats_method +nulls_ignored +analyze table bug30243_1; +Table Op Msg_type Msg_text +test.bug30243_1 analyze status OK +analyze table bug30243_2; +Table Op Msg_type Msg_text +test.bug30243_2 analyze status OK +analyze table bug30243_3; +Table Op Msg_type Msg_text 
+test.bug30243_3 analyze status OK +explain SELECT COUNT(*), 0 +FROM bug30243_1 orgs +LEFT JOIN bug30243_3 sa_opportunities +ON orgs.org_id=sa_opportunities.org_id +LEFT JOIN bug30243_2 contacts +ON orgs.org_id=contacts.org_id ; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE orgs index NULL org_id 4 NULL 128 Using index +1 SIMPLE sa_opportunities ref org_id org_id 5 test.orgs.org_id 1 Using index +1 SIMPLE contacts ref contacts$org_id contacts$org_id 5 test.orgs.org_id 1 Using index +select @@innodb_stats_method; +@@innodb_stats_method +nulls_ignored +set global innodb_stats_method = "nulls_unequal"; +select @@innodb_stats_method; +@@innodb_stats_method +nulls_unequal +analyze table bug30243_1; +Table Op Msg_type Msg_text +test.bug30243_1 analyze status OK +analyze table bug30243_2; +Table Op Msg_type Msg_text +test.bug30243_2 analyze status OK +analyze table bug30243_3; +Table Op Msg_type Msg_text +test.bug30243_3 analyze status OK +explain SELECT COUNT(*), 0 +FROM bug30243_1 orgs +LEFT JOIN bug30243_3 sa_opportunities +ON orgs.org_id=sa_opportunities.org_id +LEFT JOIN bug30243_2 contacts +ON orgs.org_id=contacts.org_id; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE orgs index NULL org_id 4 NULL 128 Using index +1 SIMPLE sa_opportunities ref org_id org_id 5 test.orgs.org_id 1 Using index +1 SIMPLE contacts ref contacts$org_id contacts$org_id 5 test.orgs.org_id 1 Using index +SELECT COUNT(*) FROM table_bug30423 WHERE org_id IS NULL; +COUNT(*) +1024 +set global innodb_stats_method = "nulls_unequal"; +analyze table table_bug30423; +Table Op Msg_type Msg_text +test.table_bug30423 analyze status OK +set global innodb_stats_method = "nulls_ignored"; +analyze table table_bug30423; +Table Op Msg_type Msg_text +test.table_bug30423 analyze status OK +set global innodb_stats_method = nulls_equal; +drop table bug30243_2; +drop table bug30243_1; +drop table bug30243_3; +drop table table_bug30423; diff --git 
a/mysql-test/suite/innodb_plugin/t/innodb_bug30423.test b/mysql-test/suite/innodb_plugin/t/innodb_bug30423.test new file mode 100644 index 00000000000..458c2967e19 --- /dev/null +++ b/mysql-test/suite/innodb_plugin/t/innodb_bug30423.test @@ -0,0 +1,211 @@ +# Test for Bug #30423, InnoDBs treatment of NULL in index stats causes +# bad "rows examined" estimates. +# Implemented InnoDB system variable "innodb_stats_method" with +# "nulls_equal" (default), "nulls_unequal", and "nulls_ignored" options. + +-- source include/have_innodb_plugin.inc + +let $innodb_stats_method_orig = `select @@innodb_stats_method`; + +# default setting for innodb_stats_method is "nulls_equal" +set global innodb_stats_method = default; + +select @@innodb_stats_method; + +# create three tables, bug30243_1, bug30243_2 and bug30243_3. +# The test scenario is adopted from original bug #30423 report. +# table bug30243_1 and bug30243_3 have many NULL values + +-- disable_result_log +-- disable_query_log + +DROP TABLE IF EXISTS bug30243_1; +CREATE TABLE bug30243_1 ( + org_id int(11) NOT NULL default '0', + UNIQUE KEY (org_id) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +LOCK TABLES bug30243_1 WRITE; +INSERT INTO bug30243_1 VALUES (11),(15),(16),(17),(19),(20),(21),(23),(24), +(25),(26),(27),(28),(29),(30),(31),(32),(33),(34),(35),(37),(38),(40),(41), +(42),(43),(44),(45),(46),(47),(48),(49),(50),(51),(52),(53),(54),(55),(56), +(57),(58),(59),(60),(61),(62),(63),(64),(65),(66),(67),(68),(69),(70),(71), +(72),(73),(74),(75),(76),(77),(78),(79),(80),(81),(82),(83),(84),(85),(86), +(87),(88),(89),(90),(91),(92),(93),(94),(95),(96),(97),(98),(99),(100),(101), +(102),(103),(104),(105),(106),(107),(108),(109),(110),(111),(112),(113),(114), +(115),(116),(117),(118),(119),(120),(121),(122),(123),(124),(125),(126),(127), +(128),(129),(130),(131),(132),(133),(134),(135),(136),(137),(138),(139),(140), +(141),(142),(143),(144),(145); +UNLOCK TABLES; + +DROP TABLE IF EXISTS bug30243_3; +CREATE TABLE bug30243_3 ( + 
org_id int(11) default NULL, + KEY (org_id) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +INSERT INTO bug30243_3 VALUES (NULL); + +begin; +let $i=14; +while ($i) +{ + INSERT INTO bug30243_3 SELECT NULL FROM bug30243_3; + dec $i; +} + +INSERT INTO bug30243_3 VALUES (34),(34),(35),(56),(58),(62),(62),(64),(65),(66),(80),(135),(137),(138),(139),(140),(142),(143),(144),(145); +commit; + +DROP TABLE IF EXISTS bug30243_2; +CREATE TABLE bug30243_2 ( + org_id int(11) default NULL, + KEY `contacts$org_id` (org_id) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +INSERT INTO bug30243_2 VALUES (NULL); + +begin; +let $i=16; +while ($i) +{ + INSERT INTO bug30243_2 SELECT NULL FROM bug30243_2; + dec $i; +} + +INSERT INTO bug30243_2 VALUES (11),(15),(16),(17),(20),(21),(23),(24),(25), +(26),(27),(28),(29),(30),(31),(32),(33),(34),(37),(38),(40),(41),(42),(43), +(44),(45),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46), +(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46), +(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46), +(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46), +(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46), +(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(46),(48), +(48),(50),(51),(52),(52),(53),(54),(55),(57),(60),(61),(62),(62),(62),(62), +(62),(63),(64),(64),(65),(66),(66),(67),(68),(69),(70),(71),(72),(73),(74), +(75),(76),(77),(78),(79),(80),(80),(81),(82),(83),(84),(85),(86),(87),(88), +(89),(90),(91),(92),(93),(94),(95),(96),(97),(98),(99),(100),(101),(102), +(103),(104),(105),(106),(107),(108),(109),(110),(111),(112),(113),(114), +(115),(116),(117),(118),(119),(120),(121),(122),(123),(124),(125),(126), +(127),(128),(129),(130),(131),(132),(133),(133),(135),(135),(135),(135), +(136),(136),(138),(138),(139),(139),(139),(140),(141),(141),(142),(143), +(143),(145),(145); +commit; + + +-- enable_result_log +-- enable_query_log + +# 
check tables's value +select count(*) from bug30243_3 where org_id is not NULL; +select count(*) from bug30243_3 where org_id is NULL; + +select count(*) from bug30243_2 where org_id is not NULL; +select count(*) from bug30243_2 where org_id is NULL; + +select @@innodb_stats_method; + +analyze table bug30243_1; +analyze table bug30243_2; +analyze table bug30243_3; + +# Following query plan shows that we over estimate the rows per +# unique value (since there are many NULLs). +# Skip this query log since the stats estimate could vary from runs +-- disable_query_log +-- disable_result_log +explain SELECT COUNT(*), 0 + FROM bug30243_1 orgs + LEFT JOIN bug30243_3 sa_opportunities + ON orgs.org_id=sa_opportunities.org_id + LEFT JOIN bug30243_2 contacts + ON orgs.org_id=contacts.org_id ; +-- enable_query_log +-- enable_result_log + +# following set operation will fail +#--error ER_WRONG_VALUE_FOR_VAR +--error 1231 +set global innodb_stats_method = "NULL"; + +set global innodb_stats_method = "nulls_ignored"; + +select @@innodb_stats_method; + +# Regenerate the stats with "nulls_ignored" option + +analyze table bug30243_1; +analyze table bug30243_2; +analyze table bug30243_3; + +# Following query plan shows that we get the correct rows per +# unique value (should be approximately 1 row per value) +explain SELECT COUNT(*), 0 + FROM bug30243_1 orgs + LEFT JOIN bug30243_3 sa_opportunities + ON orgs.org_id=sa_opportunities.org_id + LEFT JOIN bug30243_2 contacts + ON orgs.org_id=contacts.org_id ; + +select @@innodb_stats_method; + +# Try the "nulls_unequal" option +set global innodb_stats_method = "nulls_unequal"; + +select @@innodb_stats_method; + +analyze table bug30243_1; +analyze table bug30243_2; +analyze table bug30243_3; + +# Following query plan shows that we get the correct rows per +# unique value (~1) +explain SELECT COUNT(*), 0 + FROM bug30243_1 orgs + LEFT JOIN bug30243_3 sa_opportunities + ON orgs.org_id=sa_opportunities.org_id + LEFT JOIN bug30243_2 contacts + ON 
orgs.org_id=contacts.org_id; + + +# Create a table with all NULL values, make sure the stats calculation +# does not crash with table of all NULL values +-- disable_query_log +CREATE TABLE table_bug30423 ( + org_id int(11) default NULL, + KEY(org_id) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +INSERT INTO `table_bug30423` VALUES (NULL); + +begin; +let $i=10; +while ($i) +{ + INSERT INTO table_bug30423 SELECT NULL FROM table_bug30423; + dec $i; +} +commit; + +-- enable_query_log + +SELECT COUNT(*) FROM table_bug30423 WHERE org_id IS NULL; + +# calculate the statistics for the table for "nulls_ignored" and +# "nulls_unequal" option +set global innodb_stats_method = "nulls_unequal"; +analyze table table_bug30423; + +set global innodb_stats_method = "nulls_ignored"; +analyze table table_bug30423; + + +eval set global innodb_stats_method = $innodb_stats_method_orig; + +drop table bug30243_2; + +drop table bug30243_1; + +drop table bug30243_3; + +drop table table_bug30423; diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c index a7160d74a32..9f4babfaae6 100644 --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -66,6 +66,13 @@ this many index pages */ /*--------------------------------------*/ #define BTR_BLOB_HDR_SIZE 8 +/* Estimated table level stats from sampled value. */ +#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, ext_size, not_empty) \ + ((value * (ib_longlong) index->stat_n_leaf_pages \ + + BTR_KEY_VAL_ESTIMATE_N_PAGES - 1 + ext_size \ + + not_empty) \ + / (BTR_KEY_VAL_ESTIMATE_N_PAGES + ext_size)) + /*********************************************************************** Marks all extern fields in a record as owned by the record. 
This function should be called if the delete mark of a record is removed: a not delete @@ -2834,10 +2841,55 @@ btr_estimate_n_rows_in_range( } } +/*********************************************************************** +Record the number of non_null key values in a given index for +each n-column prefix of the index where n < dict_index_get_n_unique(index). +The estimates are eventually stored in the array: +index->stat_n_non_null_key_vals. */ +static +void +btr_record_not_null_field_in_rec( +/*=============================*/ + rec_t* rec, /* in: physical record */ + ulint n_unique, /* in: dict_index_get_n_unique(index), + number of columns uniquely determine + an index entry */ + const ulint* offsets, /* in: rec_get_offsets(rec, index), + its size could be for all fields or + that of "n_unique" */ + ib_longlong* n_not_null) /* in/out: array to record number of + not null rows for n-column prefix */ +{ + ulint i; + + ut_ad(rec_offs_n_fields(offsets) >= n_unique); + + if (n_not_null == NULL) { + return; + } + + for (i = 0; i < n_unique; i++) { + ulint rec_len; + byte* field; + + field = rec_get_nth_field(rec, offsets, i, &rec_len); + + if (rec_len != UNIV_SQL_NULL) { + n_not_null[i]++; + } else { + /* Break if we hit the first NULL value */ + break; + } + } +} + /*********************************************************************** Estimates the number of different key values in a given index, for each n-column prefix of the index where n <= dict_index_get_n_unique(index). -The estimates are stored in the array index->stat_n_diff_key_vals. */ +The estimates are stored in the array index->stat_n_diff_key_vals. +If innodb_stats_method is "nulls_ignored", we also record the number of +non-null values for each prefix and store the estimates in +array index->stat_n_non_null_key_vals. 
*/ void btr_estimate_number_of_different_key_vals( @@ -2851,6 +2903,8 @@ btr_estimate_number_of_different_key_vals( ulint matched_fields; ulint matched_bytes; ib_longlong* n_diff; + ib_longlong* n_not_null; + ibool stats_null_not_equal; ulint not_empty_flag = 0; ulint total_external_size = 0; ulint i; @@ -2858,24 +2912,47 @@ btr_estimate_number_of_different_key_vals( ulint add_on; mtr_t mtr; mem_heap_t* heap = NULL; - ulint offsets_rec_[REC_OFFS_NORMAL_SIZE]; - ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE]; - ulint* offsets_rec = offsets_rec_; - ulint* offsets_next_rec= offsets_next_rec_; - *offsets_rec_ = (sizeof offsets_rec_) / sizeof *offsets_rec_; - *offsets_next_rec_ - = (sizeof offsets_next_rec_) / sizeof *offsets_next_rec_; + ulint* offsets_rec = NULL; + ulint* offsets_next_rec = NULL; n_cols = dict_index_get_n_unique(index); - n_diff = mem_alloc((n_cols + 1) * sizeof(ib_longlong)); + heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null) + * (n_cols + 1) + + dict_index_get_n_fields(index) + * (sizeof *offsets_rec + + sizeof *offsets_next_rec)); - memset(n_diff, 0, (n_cols + 1) * sizeof(ib_longlong)); + n_diff = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_longlong)); + + n_not_null = NULL; + + /* Check srv_innodb_stats_method setting, and decide whether we + need to record non-null value and also decide if NULL is + considered equal (by setting stats_null_not_equal value) */ + switch (srv_innodb_stats_method) { + case SRV_STATS_NULLS_IGNORED: + n_not_null = mem_heap_zalloc(heap, (n_cols + 1) + * sizeof *n_not_null); + /* fall through */ + + case SRV_STATS_NULLS_UNEQUAL: + /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL + case, we will treat NULLs as unequal value */ + stats_null_not_equal = TRUE; + break; + + case SRV_STATS_NULLS_EQUAL: + stats_null_not_equal = FALSE; + break; + + default: + ut_error; + } /* We sample some pages in the index to get an estimate */ for (i = 0; i < BTR_KEY_VAL_ESTIMATE_N_PAGES; i++) { - rec_t* supremum; 
mtr_start(&mtr); btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr); @@ -2888,18 +2965,22 @@ btr_estimate_number_of_different_key_vals( page = btr_cur_get_page(&cursor); - supremum = page_get_supremum_rec(page); rec = page_rec_get_next(page_get_infimum_rec(page)); - if (rec != supremum) { + if (!page_rec_is_supremum(rec)) { not_empty_flag = 1; offsets_rec = rec_get_offsets(rec, index, offsets_rec, ULINT_UNDEFINED, &heap); + + if (n_not_null) { + btr_record_not_null_field_in_rec( + rec, n_cols, offsets_rec, n_not_null); + } } - while (rec != supremum) { + while (!page_rec_is_supremum(rec)) { rec_t* next_rec = page_rec_get_next(rec); - if (next_rec == supremum) { + if (page_rec_is_supremum(next_rec)) { break; } @@ -2911,7 +2992,8 @@ btr_estimate_number_of_different_key_vals( cmp_rec_rec_with_match(rec, next_rec, offsets_rec, offsets_next_rec, - index, &matched_fields, + index, stats_null_not_equal, + &matched_fields, &matched_bytes); for (j = matched_fields + 1; j <= n_cols; j++) { @@ -2921,6 +3003,12 @@ btr_estimate_number_of_different_key_vals( n_diff[j]++; } + if (n_not_null) { + btr_record_not_null_field_in_rec( + next_rec, n_cols, offsets_next_rec, + n_not_null); + } + total_external_size += btr_rec_get_externally_stored_len( rec, offsets_rec); @@ -2971,14 +3059,8 @@ btr_estimate_number_of_different_key_vals( included in index->stat_n_leaf_pages) */ for (j = 0; j <= n_cols; j++) { - index->stat_n_diff_key_vals[j] - = ((n_diff[j] - * (ib_longlong)index->stat_n_leaf_pages - + BTR_KEY_VAL_ESTIMATE_N_PAGES - 1 - + total_external_size - + not_empty_flag) - / (BTR_KEY_VAL_ESTIMATE_N_PAGES - + total_external_size)); + index->stat_n_diff_key_vals[j] = BTR_TABLE_STATS_FROM_SAMPLE( + n_diff[j], index, total_external_size, not_empty_flag); /* If the tree is small, smaller than 10 * BTR_KEY_VAL_ESTIMATE_N_PAGES + total_external_size, then @@ -2997,12 +3079,20 @@ btr_estimate_number_of_different_key_vals( } index->stat_n_diff_key_vals[j] += add_on; + + /* Update 
the stat_n_non_null_key_vals[] with our + sampled result. stat_n_non_null_key_vals[] is created + and initialized to zero in dict_index_add_to_cache(), + along with stat_n_diff_key_vals[] array */ + if (n_not_null != NULL && (j < n_cols)) { + index->stat_n_non_null_key_vals[j] = + BTR_TABLE_STATS_FROM_SAMPLE( + n_not_null[j], index, + total_external_size, not_empty_flag); + } } - mem_free(n_diff); - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } + mem_heap_free(heap); } /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ diff --git a/storage/innobase/dict/dict0dict.c b/storage/innobase/dict/dict0dict.c index fda6555e082..beea0a2f411 100644 --- a/storage/innobase/dict/dict0dict.c +++ b/storage/innobase/dict/dict0dict.c @@ -1358,6 +1358,12 @@ dict_index_add_to_cache( new_index->heap, (1 + dict_index_get_n_unique(new_index)) * sizeof(ib_longlong)); + + new_index->stat_n_non_null_key_vals = mem_heap_zalloc( + new_index->heap, + (1 + dict_index_get_n_unique(new_index)) + * sizeof(*new_index->stat_n_non_null_key_vals)); + /* Give some sensible values to stat_n_... in case we do not calculate statistics quickly enough */ @@ -3817,6 +3823,10 @@ dict_update_statistics_low( for (i = dict_index_get_n_unique(index); i; ) { index->stat_n_diff_key_vals[i--] = 1; } + + memset(index->stat_n_non_null_key_vals, 0, + (1 + dict_index_get_n_unique(index)) + * sizeof(*index->stat_n_non_null_key_vals)); } index = dict_table_get_next_index(index); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 4c52326a58a..6f58fd70fbd 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -130,6 +130,25 @@ static my_bool innobase_adaptive_hash_index = TRUE; static char* internal_innobase_data_file_path = NULL; +/* Possible values for system variable "innodb_stats_method". 
The values +are defined the same as its corresponding MyISAM system variable +"myisam_stats_method"(see "myisam_stats_method_names"), for better usability */ +static const char* innodb_stats_method_names[] = { + "nulls_equal", + "nulls_unequal", + "nulls_ignored", + NullS +}; + +/* Used to define an enumerate type of the system variable innodb_stats_method. +This is the same as "myisam_stats_method_typelib" */ +static TYPELIB innodb_stats_method_typelib = { + array_elements(innodb_stats_method_names) - 1, + "innodb_stats_method_typelib", + innodb_stats_method_names, + NULL +}; + /* The following counter is used to convey information to InnoDB about server activity: in selects it is not sensible to call srv_active_wake_master_thread after each fetch or search, we only do @@ -6362,6 +6381,65 @@ ha_innobase::read_time( return(ranges + (double) rows / (double) total_rows * time_for_scan); } +/************************************************************************* +Calculate Record Per Key value. Need to exclude the NULL value if +innodb_stats_method is set to "nulls_ignored" */ +static +ha_rows +innodb_rec_per_key( +/*===============*/ + /* out: estimated record per key + value */ + dict_index_t* index, /* in: dict_index_t structure */ + ulint i, /* in: the column we are + calculating rec per key */ + ha_rows records) /* in: estimated total records */ +{ + ha_rows rec_per_key; + + ut_ad(i < dict_index_get_n_unique(index)); + + /* Note the stat_n_diff_key_vals[] stores the diff value with + n-prefix indexing, so it is always stat_n_diff_key_vals[i + 1] */ + if (index->stat_n_diff_key_vals[i + 1] == 0) { + + rec_per_key = records; + } else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) { + ib_longlong num_null; + + /* Number of rows with NULL value in this + field */ + num_null = records - index->stat_n_non_null_key_vals[i]; + + /* In theory, index->stat_n_non_null_key_vals[i] + should always be less than the number of records. 
+ Since this is statistics value, the value could + have slight discrepancy. But we will make sure + the number of null values is not a negative number. */ + num_null = (num_null < 0) ? 0 : num_null; + + /* If the number of NULL values is the same as or + larger than that of the distinct values, we could + consider that the table consists mostly of NULL value. + Set rec_per_key to 1. */ + if (index->stat_n_diff_key_vals[i + 1] <= num_null) { + rec_per_key = 1; + } else { + /* Need to exclude rows with NULL values from + rec_per_key calculation */ + rec_per_key = (ha_rows)( + (records - num_null) + / (index->stat_n_diff_key_vals[i + 1] + - num_null)); + } + } else { + rec_per_key = (ha_rows) + (records / index->stat_n_diff_key_vals[i + 1]); + } + + return(rec_per_key); +} + /************************************************************************* Returns statistics information of the table to the MySQL interpreter, in various fields of the handle object. */ @@ -6568,13 +6646,8 @@ ha_innobase::info_low( break; } - if (index->stat_n_diff_key_vals[j + 1] == 0) { - - rec_per_key = stats.records; - } else { - rec_per_key = (ha_rows)(stats.records / - index->stat_n_diff_key_vals[j + 1]); - } + rec_per_key = innodb_rec_per_key( + index, j, stats.records); /* Since MySQL seems to favor table scans too much over index searches, we pretend @@ -8990,6 +9063,13 @@ static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode, AUTOINC_OLD_STYLE_LOCKING, /* Minimum value */ AUTOINC_NO_LOCKING, 0); /* Maximum value */ +static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method, + PLUGIN_VAR_RQCMDARG, + "Specifies how InnoDB index statistics collection code should " + "treat NULLs. 
Possible values are NULLS_EQUAL (default), " + "NULLS_UNEQUAL and NULLS_IGNORED", + NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib); + #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug, PLUGIN_VAR_RQCMDARG, @@ -9031,6 +9111,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(stats_on_metadata), MYSQL_SYSVAR(use_legacy_cardinality_algorithm), MYSQL_SYSVAR(adaptive_hash_index), + MYSQL_SYSVAR(stats_method), MYSQL_SYSVAR(status_file), MYSQL_SYSVAR(support_xa), MYSQL_SYSVAR(sync_spin_loops), diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index 213dcb7f568..20235c55f22 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -404,7 +404,10 @@ btr_estimate_n_rows_in_range( /*********************************************************************** Estimates the number of different key values in a given index, for each n-column prefix of the index where n <= dict_index_get_n_unique(index). -The estimates are stored in the array index->stat_n_diff_key_vals. */ +The estimates are stored in the array index->stat_n_diff_key_vals. +If innodb_stats_method is nulls_ignored, we also record the number of +non-null values for each prefix and store the estimates in +array index->stat_n_non_null_key_vals. 
*/ void btr_estimate_number_of_different_key_vals( diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index 2f2a7441478..83dbf65ea41 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -222,6 +222,12 @@ struct dict_index_struct{ for this index, for each n-column prefix where n <= dict_get_n_unique(index); we periodically calculate new estimates */ + ib_longlong* stat_n_non_null_key_vals; + /* approximate number of non-null key values + for this index, for each column where + n < dict_get_n_unique(index); This + is used when innodb_stats_method is + "nulls_ignored". */ ulint stat_index_size; /* approximate index size in database pages */ ulint stat_n_leaf_pages; diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h index c6a6e5de4db..22a22d13e17 100644 --- a/storage/innobase/include/rem0cmp.h +++ b/storage/innobase/include/rem0cmp.h @@ -141,6 +141,10 @@ cmp_rec_rec_with_match( const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index, /* in: data dictionary index */ + ibool nulls_unequal, + /* in: TRUE if this is for index statistics + cardinality estimation, and innodb_stats_method + is "nulls_unequal" or "nulls_ignored" */ ulint* matched_fields, /* in/out: number of already completely matched fields; when the function returns, contains the value the for current diff --git a/storage/innobase/include/rem0cmp.ic b/storage/innobase/include/rem0cmp.ic index 52dc7ff5dc9..45e12301a3c 100644 --- a/storage/innobase/include/rem0cmp.ic +++ b/storage/innobase/include/rem0cmp.ic @@ -72,5 +72,5 @@ cmp_rec_rec( ulint match_b = 0; return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index, - &match_f, &match_b)); + FALSE, &match_f, &match_b)); } diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 3dd4bb961f9..811074b2be8 100644 --- 
a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -91,6 +91,11 @@ extern ulint srv_lock_table_size; extern ulint srv_n_file_io_threads; +/* The "innodb_stats_method" setting, decides how InnoDB is going +to treat NULL value when collecting statistics. It is not defined +as enum type because the configure option takes unsigned integer type. */ +extern ulong srv_innodb_stats_method; + #ifdef UNIV_LOG_ARCHIVE extern ibool srv_log_archive_on; extern ibool srv_archive_recovery; @@ -286,6 +291,19 @@ of lower numbers are included. */ #define SRV_FORCE_NO_LOG_REDO 6 /* do not do the log roll-forward in connection with recovery */ +/* Alternatives for srv_innodb_stats_method, which could be changed by +setting innodb_stats_method */ +enum srv_stats_method_name_enum { + SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as + equal. This is the default setting + for innodb_stats_method */ + SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as + NOT equal. */ + SRV_STATS_NULLS_IGNORED /* NULL values are ignored */ +}; + +typedef enum srv_stats_method_name_enum srv_stats_method_name_t; + /************************************************************************* Boots Innobase server. 
*/ diff --git a/storage/innobase/rem/rem0cmp.c b/storage/innobase/rem/rem0cmp.c index ca0ec663548..2939c119e2e 100644 --- a/storage/innobase/rem/rem0cmp.c +++ b/storage/innobase/rem/rem0cmp.c @@ -720,6 +720,10 @@ cmp_rec_rec_with_match( const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index, /* in: data dictionary index */ + ibool nulls_unequal, + /* in: TRUE if this is for index statistics + cardinality estimation, and innodb_stats_method + is "nulls_unequal" or "nulls_ignored" */ ulint* matched_fields, /* in/out: number of already completely matched fields; when the function returns, contains the value the for current @@ -821,9 +825,13 @@ cmp_rec_rec_with_match( || rec2_f_len == UNIV_SQL_NULL) { if (rec1_f_len == rec2_f_len) { - - goto next_field; - + /* This is limited to stats collection, + cannot use it for regular search */ + if (nulls_unequal) { + ret = -1; + } else { + goto next_field; + } } else if (rec2_f_len == UNIV_SQL_NULL) { /* We define the SQL null to be the diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c index 5b1184fb416..9c34e73109c 100644 --- a/storage/innobase/srv/srv0srv.c +++ b/storage/innobase/srv/srv0srv.c @@ -218,6 +218,11 @@ ulong srv_max_buf_pool_modified_pct = 90; /* variable counts amount of data read in total (in bytes) */ ulint srv_data_read = 0; +/* Internal setting for "innodb_stats_method". Decides how InnoDB treats +NULL value when collecting statistics. By default, it is set to +SRV_STATS_NULLS_EQUAL(0), ie. 
all NULL values are treated as equal */ +ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL; + /* here we count the amount of data written in total (in bytes) */ ulint srv_data_written = 0; diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index 8eb63fe8c78..43ffa762ddb 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,11 @@ +2011-01-14 The InnoDB Team + * btr/btr0cur.c, dict/dict0dict.c, handler/ha_innodb.cc, + include/btr0cur.h, include/dict0mem.h, include/rem0cmp.h, + include/rem0cmp.ic, include/srv0srv.h, rem/rem0cmp.c, + srv/srv0srv.c, innodb_bug30423.test: + Fix Bug#30423 InnoDBs treatment of NULL in index stats causes + bad "rows examined" estimates + 2011-01-06 The InnoDB Team * handler/i_s.cc, include/trx0i_s.h, trx/trx0i_s.c: Fix Bug#55397 cannot select from innodb_trx when trx_query contains diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c index c57255a25ae..1fb0bc39933 100644 --- a/storage/innodb_plugin/btr/btr0cur.c +++ b/storage/innodb_plugin/btr/btr0cur.c @@ -100,6 +100,18 @@ can be released by page reorganize, then it is reorganized */ /*--------------------------------------*/ #define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB part header, in bytes */ + +/** Estimated table level stats from sampled value. 
+@param value sampled stats +@param index index being sampled +@param sample number of sampled rows +@param ext_size external stored data size +@param not_empty table not empty +@return estimated table wide stats from sampled value */ +#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty)\ + (((value) * (ib_int64_t) index->stat_n_leaf_pages \ + + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size))) + /* @} */ #endif /* !UNIV_HOTBACKUP */ @@ -3200,10 +3212,55 @@ btr_estimate_n_rows_in_range( } } +/*******************************************************************//** +Record the number of non_null key values in a given index for +each n-column prefix of the index where n < dict_index_get_n_unique(index). +The estimates are eventually stored in the array: +index->stat_n_non_null_key_vals. */ +static +void +btr_record_not_null_field_in_rec( +/*=============================*/ + rec_t* rec, /*!< in: physical record */ + ulint n_unique, /*!< in: dict_index_get_n_unique(index), + number of columns uniquely determine + an index entry */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index), + its size could be for all fields or + that of "n_unique" */ + ib_int64_t* n_not_null) /*!< in/out: array to record number of + not null rows for n-column prefix */ +{ + ulint i; + + ut_ad(rec_offs_n_fields(offsets) >= n_unique); + + if (n_not_null == NULL) { + return; + } + + for (i = 0; i < n_unique; i++) { + ulint rec_len; + byte* field; + + field = rec_get_nth_field(rec, offsets, i, &rec_len); + + if (rec_len != UNIV_SQL_NULL) { + n_not_null[i]++; + } else { + /* Break if we hit the first NULL value */ + break; + } + } +} + /*******************************************************************//** Estimates the number of different key values in a given index, for each n-column prefix of the index where n <= dict_index_get_n_unique(index). -The estimates are stored in the array index->stat_n_diff_key_vals. 
*/ +The estimates are stored in the array index->stat_n_diff_key_vals. +If innodb_stats_method is "nulls_ignored", we also record the number of +non-null values for each prefix and store the estimates in +array index->stat_n_non_null_key_vals. */ UNIV_INTERN void btr_estimate_number_of_different_key_vals( @@ -3217,6 +3274,8 @@ btr_estimate_number_of_different_key_vals( ulint matched_fields; ulint matched_bytes; ib_int64_t* n_diff; + ib_int64_t* n_not_null; + ibool stats_null_not_equal; ullint n_sample_pages; /* number of pages to sample */ ulint not_empty_flag = 0; ulint total_external_size = 0; @@ -3225,16 +3284,43 @@ btr_estimate_number_of_different_key_vals( ullint add_on; mtr_t mtr; mem_heap_t* heap = NULL; - ulint offsets_rec_[REC_OFFS_NORMAL_SIZE]; - ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE]; - ulint* offsets_rec = offsets_rec_; - ulint* offsets_next_rec= offsets_next_rec_; - rec_offs_init(offsets_rec_); - rec_offs_init(offsets_next_rec_); + ulint* offsets_rec = NULL; + ulint* offsets_next_rec = NULL; n_cols = dict_index_get_n_unique(index); - n_diff = mem_zalloc((n_cols + 1) * sizeof(ib_int64_t)); + heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null) + * (n_cols + 1) + + dict_index_get_n_fields(index) + * (sizeof *offsets_rec + + sizeof *offsets_next_rec)); + + n_diff = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t)); + + n_not_null = NULL; + + /* Check srv_innodb_stats_method setting, and decide whether we + need to record non-null value and also decide if NULL is + considered equal (by setting stats_null_not_equal value) */ + switch (srv_innodb_stats_method) { + case SRV_STATS_NULLS_IGNORED: + n_not_null = mem_heap_zalloc(heap, (n_cols + 1) + * sizeof *n_not_null); + /* fall through */ + + case SRV_STATS_NULLS_UNEQUAL: + /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL + case, we will treat NULLs as unequal value */ + stats_null_not_equal = TRUE; + break; + + case SRV_STATS_NULLS_EQUAL: + stats_null_not_equal = FALSE; + 
break; + + default: + ut_error; + } /* It makes no sense to test more pages than are contained in the index, thus we lower the number if it is too high */ @@ -3251,7 +3337,6 @@ btr_estimate_number_of_different_key_vals( /* We sample some pages in the index to get an estimate */ for (i = 0; i < n_sample_pages; i++) { - rec_t* supremum; mtr_start(&mtr); btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr); @@ -3264,18 +3349,22 @@ btr_estimate_number_of_different_key_vals( page = btr_cur_get_page(&cursor); - supremum = page_get_supremum_rec(page); rec = page_rec_get_next(page_get_infimum_rec(page)); - if (rec != supremum) { + if (!page_rec_is_supremum(rec)) { not_empty_flag = 1; offsets_rec = rec_get_offsets(rec, index, offsets_rec, ULINT_UNDEFINED, &heap); + + if (n_not_null) { + btr_record_not_null_field_in_rec( + rec, n_cols, offsets_rec, n_not_null); + } } - while (rec != supremum) { + while (!page_rec_is_supremum(rec)) { rec_t* next_rec = page_rec_get_next(rec); - if (next_rec == supremum) { + if (page_rec_is_supremum(next_rec)) { break; } @@ -3287,7 +3376,8 @@ btr_estimate_number_of_different_key_vals( cmp_rec_rec_with_match(rec, next_rec, offsets_rec, offsets_next_rec, - index, &matched_fields, + index, stats_null_not_equal, + &matched_fields, &matched_bytes); for (j = matched_fields + 1; j <= n_cols; j++) { @@ -3297,6 +3387,12 @@ btr_estimate_number_of_different_key_vals( n_diff[j]++; } + if (n_not_null) { + btr_record_not_null_field_in_rec( + next_rec, n_cols, offsets_next_rec, + n_not_null); + } + total_external_size += btr_rec_get_externally_stored_len( rec, offsets_rec); @@ -3348,13 +3444,9 @@ btr_estimate_number_of_different_key_vals( for (j = 0; j <= n_cols; j++) { index->stat_n_diff_key_vals[j] - = ((n_diff[j] - * (ib_int64_t)index->stat_n_leaf_pages - + n_sample_pages - 1 - + total_external_size - + not_empty_flag) - / (n_sample_pages - + total_external_size)); + = BTR_TABLE_STATS_FROM_SAMPLE( + n_diff[j], index, n_sample_pages, + 
total_external_size, not_empty_flag); /* If the tree is small, smaller than 10 * n_sample_pages + total_external_size, then @@ -3373,12 +3465,20 @@ btr_estimate_number_of_different_key_vals( } index->stat_n_diff_key_vals[j] += add_on; + + /* Update the stat_n_non_null_key_vals[] with our + sampled result. stat_n_non_null_key_vals[] is created + and initialized to zero in dict_index_add_to_cache(), + along with stat_n_diff_key_vals[] array */ + if (n_not_null != NULL && (j < n_cols)) { + index->stat_n_non_null_key_vals[j] = + BTR_TABLE_STATS_FROM_SAMPLE( + n_not_null[j], index, n_sample_pages, + total_external_size, not_empty_flag); + } } - mem_free(n_diff); - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } + mem_heap_free(heap); } /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ diff --git a/storage/innodb_plugin/dict/dict0dict.c b/storage/innodb_plugin/dict/dict0dict.c index 67765555658..ff56e9cb76a 100644 --- a/storage/innodb_plugin/dict/dict0dict.c +++ b/storage/innodb_plugin/dict/dict0dict.c @@ -1669,6 +1669,12 @@ undo_size_ok: new_index->heap, (1 + dict_index_get_n_unique(new_index)) * sizeof(ib_int64_t)); + + new_index->stat_n_non_null_key_vals = mem_heap_zalloc( + new_index->heap, + (1 + dict_index_get_n_unique(new_index)) + * sizeof(*new_index->stat_n_non_null_key_vals)); + /* Give some sensible values to stat_n_... 
in case we do not calculate statistics quickly enough */ @@ -4291,6 +4297,10 @@ dict_update_statistics( for (i = dict_index_get_n_unique(index); i; ) { index->stat_n_diff_key_vals[i--] = 1; } + + memset(index->stat_n_non_null_key_vals, 0, + (1 + dict_index_get_n_unique(index)) + * sizeof(*index->stat_n_non_null_key_vals)); } index = dict_table_get_next_index(index); diff --git a/storage/innodb_plugin/handler/ha_innodb.cc b/storage/innodb_plugin/handler/ha_innodb.cc index 86168e2bc9b..2d60c7397b0 100644 --- a/storage/innodb_plugin/handler/ha_innodb.cc +++ b/storage/innodb_plugin/handler/ha_innodb.cc @@ -174,6 +174,25 @@ static char* internal_innobase_data_file_path = NULL; static char* innodb_version_str = (char*) INNODB_VERSION_STR; +/** Possible values for system variable "innodb_stats_method". The values +are defined the same as its corresponding MyISAM system variable +"myisam_stats_method"(see "myisam_stats_method_names"), for better usability */ +static const char* innodb_stats_method_names[] = { + "nulls_equal", + "nulls_unequal", + "nulls_ignored", + NullS +}; + +/** Used to define an enumerate type of the system variable innodb_stats_method. +This is the same as "myisam_stats_method_typelib" */ +static TYPELIB innodb_stats_method_typelib = { + array_elements(innodb_stats_method_names) - 1, + "innodb_stats_method_typelib", + innodb_stats_method_names, + NULL +}; + /* The following counter is used to convey information to InnoDB about server activity: in selects it is not sensible to call srv_active_wake_master_thread after each fetch or search, we only do @@ -7507,6 +7526,65 @@ innobase_get_mysql_key_number_for_index( return(0); } + +/*********************************************************************//** +Calculate Record Per Key value. 
Need to exclude the NULL value if +innodb_stats_method is set to "nulls_ignored" +@return estimated record per key value */ +static +ha_rows +innodb_rec_per_key( +/*===============*/ + dict_index_t* index, /*!< in: dict_index_t structure */ + ulint i, /*!< in: the column we are + calculating rec per key */ + ha_rows records) /*!< in: estimated total records */ +{ + ha_rows rec_per_key; + + ut_ad(i < dict_index_get_n_unique(index)); + + /* Note the stat_n_diff_key_vals[] stores the diff value with + n-prefix indexing, so it is always stat_n_diff_key_vals[i + 1] */ + if (index->stat_n_diff_key_vals[i + 1] == 0) { + + rec_per_key = records; + } else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) { + ib_int64_t num_null; + + /* Number of rows with NULL value in this + field */ + num_null = records - index->stat_n_non_null_key_vals[i]; + + /* In theory, index->stat_n_non_null_key_vals[i] + should always be less than the number of records. + Since this is statistics value, the value could + have slight discrepancy. But we will make sure + the number of null values is not a negative number. */ + num_null = (num_null < 0) ? 0 : num_null; + + /* If the number of NULL values is the same as or + larger than that of the distinct values, we could + consider that the table consists mostly of NULL value. + Set rec_per_key to 1. */ + if (index->stat_n_diff_key_vals[i + 1] <= num_null) { + rec_per_key = 1; + } else { + /* Need to exclude rows with NULL values from + rec_per_key calculation */ + rec_per_key = (ha_rows)( + (records - num_null) + / (index->stat_n_diff_key_vals[i + 1] + - num_null)); + } + } else { + rec_per_key = (ha_rows) + (records / index->stat_n_diff_key_vals[i + 1]); + } + + return(rec_per_key); +} + /*********************************************************************//** Returns statistics information of the table to the MySQL interpreter, in various fields of the handle object. 
*/ @@ -7737,13 +7815,8 @@ ha_innobase::info_low( break; } - if (index->stat_n_diff_key_vals[j + 1] == 0) { - - rec_per_key = stats.records; - } else { - rec_per_key = (ha_rows)(stats.records / - index->stat_n_diff_key_vals[j + 1]); - } + rec_per_key = innodb_rec_per_key( + index, j, stats.records); /* Since MySQL seems to favor table scans too much over index searches, we pretend @@ -10934,6 +11007,13 @@ static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering, innodb_change_buffering_validate, innodb_change_buffering_update, "inserts"); +static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method, + PLUGIN_VAR_RQCMDARG, + "Specifies how InnoDB index statistics collection code should " + "treat NULLs. Possible values are NULLS_EQUAL (default), " + "NULLS_UNEQUAL and NULLS_IGNORED", + NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib); + #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug, PLUGIN_VAR_RQCMDARG, @@ -10988,6 +11068,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(stats_on_metadata), MYSQL_SYSVAR(stats_sample_pages), MYSQL_SYSVAR(adaptive_hash_index), + MYSQL_SYSVAR(stats_method), MYSQL_SYSVAR(replication_delay), MYSQL_SYSVAR(status_file), MYSQL_SYSVAR(strict_mode), diff --git a/storage/innodb_plugin/include/btr0cur.h b/storage/innodb_plugin/include/btr0cur.h index b477ad0320a..cb8cb399715 100644 --- a/storage/innodb_plugin/include/btr0cur.h +++ b/storage/innodb_plugin/include/btr0cur.h @@ -478,7 +478,10 @@ btr_estimate_n_rows_in_range( /*******************************************************************//** Estimates the number of different key values in a given index, for each n-column prefix of the index where n <= dict_index_get_n_unique(index). -The estimates are stored in the array index->stat_n_diff_key_vals. */ +The estimates are stored in the array index->stat_n_diff_key_vals. 
+If innodb_stats_method is nulls_ignored, we also record the number of +non-null values for each prefix and store the estimates in +array index->stat_n_non_null_key_vals. */ UNIV_INTERN void btr_estimate_number_of_different_key_vals( diff --git a/storage/innodb_plugin/include/dict0mem.h b/storage/innodb_plugin/include/dict0mem.h index 19782c2e76a..09a068ccb93 100644 --- a/storage/innodb_plugin/include/dict0mem.h +++ b/storage/innodb_plugin/include/dict0mem.h @@ -321,6 +321,12 @@ struct dict_index_struct{ dict_get_n_unique(index); we periodically calculate new estimates */ + ib_int64_t* stat_n_non_null_key_vals; + /* approximate number of non-null key values + for this index, for each column where + n < dict_get_n_unique(index); This + is used when innodb_stats_method is + "nulls_ignored". */ ulint stat_index_size; /*!< approximate index size in database pages */ diff --git a/storage/innodb_plugin/include/rem0cmp.h b/storage/innodb_plugin/include/rem0cmp.h index 2f751a38864..a908521c9f7 100644 --- a/storage/innodb_plugin/include/rem0cmp.h +++ b/storage/innodb_plugin/include/rem0cmp.h @@ -165,6 +165,10 @@ cmp_rec_rec_with_match( const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */ dict_index_t* index, /*!< in: data dictionary index */ + ibool nulls_unequal, + /* in: TRUE if this is for index statistics + cardinality estimation, and innodb_stats_method + is "nulls_unequal" or "nulls_ignored" */ ulint* matched_fields, /*!< in/out: number of already completely matched fields; when the function returns, contains the value the for current diff --git a/storage/innodb_plugin/include/rem0cmp.ic b/storage/innodb_plugin/include/rem0cmp.ic index 39ef5f4fba3..63415fe7837 100644 --- a/storage/innodb_plugin/include/rem0cmp.ic +++ b/storage/innodb_plugin/include/rem0cmp.ic @@ -87,5 +87,5 @@ cmp_rec_rec( ulint match_b = 0; return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index, - &match_f, 
&match_b)); + FALSE, &match_f, &match_b)); } diff --git a/storage/innodb_plugin/include/srv0srv.h b/storage/innodb_plugin/include/srv0srv.h index 7aa2ce74720..91ae895040c 100644 --- a/storage/innodb_plugin/include/srv0srv.h +++ b/storage/innodb_plugin/include/srv0srv.h @@ -154,6 +154,11 @@ capacity. PCT_IO(5) -> returns the number of IO operations that is 5% of the max where max is srv_io_capacity. */ #define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) p / 100.0))) +/* The "innodb_stats_method" setting, decides how InnoDB is going +to treat NULL value when collecting statistics. It is not defined +as enum type because the configure option takes unsigned integer type. */ +extern ulong srv_innodb_stats_method; + #ifdef UNIV_LOG_ARCHIVE extern ibool srv_log_archive_on; extern ibool srv_archive_recovery; @@ -363,6 +368,19 @@ enum { in connection with recovery */ }; +/* Alternatives for srv_innodb_stats_method, which could be changed by +setting innodb_stats_method */ +enum srv_stats_method_name_enum { + SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as + equal. This is the default setting + for innodb_stats_method */ + SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as + NOT equal. */ + SRV_STATS_NULLS_IGNORED /* NULL values are ignored */ +}; + +typedef enum srv_stats_method_name_enum srv_stats_method_name_t; + #ifndef UNIV_HOTBACKUP /** Types of threads existing in the system. 
*/ enum srv_thread_type { diff --git a/storage/innodb_plugin/rem/rem0cmp.c b/storage/innodb_plugin/rem/rem0cmp.c index 35b67992558..04d2c15437b 100644 --- a/storage/innodb_plugin/rem/rem0cmp.c +++ b/storage/innodb_plugin/rem/rem0cmp.c @@ -862,6 +862,10 @@ cmp_rec_rec_with_match( const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */ dict_index_t* index, /*!< in: data dictionary index */ + ibool nulls_unequal, + /* in: TRUE if this is for index statistics + cardinality estimation, and innodb_stats_method + is "nulls_unequal" or "nulls_ignored" */ ulint* matched_fields, /*!< in/out: number of already completely matched fields; when the function returns, contains the value the for current @@ -961,9 +965,13 @@ cmp_rec_rec_with_match( || rec2_f_len == UNIV_SQL_NULL) { if (rec1_f_len == rec2_f_len) { - - goto next_field; - + /* This is limited to stats collection, + cannot use it for regular search */ + if (nulls_unequal) { + ret = -1; + } else { + goto next_field; + } } else if (rec2_f_len == UNIV_SQL_NULL) { /* We define the SQL null to be the diff --git a/storage/innodb_plugin/srv/srv0srv.c b/storage/innodb_plugin/srv/srv0srv.c index f7e7e351bdc..3cf17f33c40 100644 --- a/storage/innodb_plugin/srv/srv0srv.c +++ b/storage/innodb_plugin/srv/srv0srv.c @@ -243,6 +243,11 @@ UNIV_INTERN ulong srv_max_buf_pool_modified_pct = 75; /* variable counts amount of data read in total (in bytes) */ UNIV_INTERN ulint srv_data_read = 0; +/* Internal setting for "innodb_stats_method". Decides how InnoDB treats +NULL value when collecting statistics. By default, it is set to +SRV_STATS_NULLS_EQUAL(0), ie. 
all NULL values are treated as equal */ +ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL; + /* here we count the amount of data written in total (in bytes) */ UNIV_INTERN ulint srv_data_written = 0; From 1f3975b4f8b22eef97b2d86b8ecbc17c90c5f1ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 17 Jan 2011 14:06:48 +0200 Subject: [PATCH 04/15] Non-functional changes. Remove the unused data type dict_cluster_t. Remove a bogus comment about latching order. --- storage/innobase/include/dict0types.h | 5 ----- storage/innobase/include/trx0rseg.h | 4 +--- storage/innodb_plugin/include/dict0types.h | 5 ----- storage/innodb_plugin/include/trx0rseg.h | 4 +--- 4 files changed, 2 insertions(+), 16 deletions(-) diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index b90545f2105..6674b5ff397 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -16,11 +16,6 @@ typedef struct dict_index_struct dict_index_t; typedef struct dict_table_struct dict_table_t; typedef struct dict_foreign_struct dict_foreign_t; -/* A cluster object is a table object with the type field set to -DICT_CLUSTERED */ - -typedef dict_table_t dict_cluster_t; - typedef struct ind_node_struct ind_node_t; typedef struct tab_node_struct tab_node_t; diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h index 46ba010bd1d..22f8aa89181 100644 --- a/storage/innobase/include/trx0rseg.h +++ b/storage/innobase/include/trx0rseg.h @@ -121,9 +121,7 @@ struct trx_rseg_struct{ ulint id; /* rollback segment id == the index of its slot in the trx system file copy */ mutex_t mutex; /* mutex protecting the fields in this - struct except id; NOTE that the latching - order must always be kernel mutex -> - rseg mutex */ + struct except id, which is constant */ ulint space; /* space where the rollback segment is header is placed */ ulint page_no;/* page number of the rollback segment diff 
--git a/storage/innodb_plugin/include/dict0types.h b/storage/innodb_plugin/include/dict0types.h index 7ad69193cc9..f14b59a19d4 100644 --- a/storage/innodb_plugin/include/dict0types.h +++ b/storage/innodb_plugin/include/dict0types.h @@ -33,11 +33,6 @@ typedef struct dict_index_struct dict_index_t; typedef struct dict_table_struct dict_table_t; typedef struct dict_foreign_struct dict_foreign_t; -/* A cluster object is a table object with the type field set to -DICT_CLUSTERED */ - -typedef dict_table_t dict_cluster_t; - typedef struct ind_node_struct ind_node_t; typedef struct tab_node_struct tab_node_t; diff --git a/storage/innodb_plugin/include/trx0rseg.h b/storage/innodb_plugin/include/trx0rseg.h index a25d84f1e84..e3674089735 100644 --- a/storage/innodb_plugin/include/trx0rseg.h +++ b/storage/innodb_plugin/include/trx0rseg.h @@ -135,9 +135,7 @@ struct trx_rseg_struct{ ulint id; /*!< rollback segment id == the index of its slot in the trx system file copy */ mutex_t mutex; /*!< mutex protecting the fields in this - struct except id; NOTE that the latching - order must always be kernel mutex -> - rseg mutex */ + struct except id, which is constant */ ulint space; /*!< space where the rollback segment is header is placed */ ulint zip_size;/* compressed page size of space From 359bddbee1a27864a38195e85fceab8a1678081d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 18 Jan 2011 12:25:13 +0200 Subject: [PATCH 05/15] Bug#59579 rw_lock_debug_print outputs to stderr rw_lock_debug_print(): Add parameter FILE* for specifying the output stream. rw_lock_list_print_info(): Invoke rw_lock_debug_print() on file, not stderr. 
--- storage/innobase/include/sync0rw.h | 3 ++- storage/innobase/sync/sync0arr.c | 4 ++-- storage/innobase/sync/sync0rw.c | 19 ++++++++++--------- storage/innodb_plugin/ChangeLog | 6 ++++++ storage/innodb_plugin/include/sync0rw.h | 3 ++- storage/innodb_plugin/sync/sync0arr.c | 4 ++-- storage/innodb_plugin/sync/sync0rw.c | 19 ++++++++++--------- 7 files changed, 34 insertions(+), 24 deletions(-) diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h index 008df80a2c7..dd898557d6e 100644 --- a/storage/innobase/include/sync0rw.h +++ b/storage/innobase/include/sync0rw.h @@ -1,7 +1,7 @@ /****************************************************** The read-write lock (for threads, not for database transactions) -(c) 1995 Innobase Oy +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Created 9/11/1995 Heikki Tuuri *******************************************************/ @@ -409,6 +409,7 @@ Prints info of a debug struct. */ void rw_lock_debug_print( /*================*/ + FILE* f, /* in: output stream */ rw_lock_debug_t* info); /* in: debug struct */ #endif /* UNIV_SYNC_DEBUG */ diff --git a/storage/innobase/sync/sync0arr.c b/storage/innobase/sync/sync0arr.c index 154593a9035..41d3492c8c9 100644 --- a/storage/innobase/sync/sync0arr.c +++ b/storage/innobase/sync/sync0arr.c @@ -1,7 +1,7 @@ /****************************************************** The wait array used in synchronization primitives -(c) 1995 Innobase Oy +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. 
Created 9/5/1995 Heikki Tuuri *******************************************************/ @@ -709,7 +709,7 @@ print: fprintf(stderr, "rw-lock %p ", (void*) lock); sync_array_cell_print(stderr, cell); - rw_lock_debug_print(debug); + rw_lock_debug_print(stderr, debug); return(TRUE); } } diff --git a/storage/innobase/sync/sync0rw.c b/storage/innobase/sync/sync0rw.c index 0b05fb826ac..ef4c07e8c26 100644 --- a/storage/innobase/sync/sync0rw.c +++ b/storage/innobase/sync/sync0rw.c @@ -1,7 +1,7 @@ /****************************************************** The read-write lock (for thread synchronization) -(c) 1995 Innobase Oy +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Created 9/11/1995 Heikki Tuuri *******************************************************/ @@ -830,7 +830,7 @@ rw_lock_list_print_info( info = UT_LIST_GET_FIRST(lock->debug_list); while (info != NULL) { - rw_lock_debug_print(info); + rw_lock_debug_print(file, info); info = UT_LIST_GET_NEXT(list, info); } } @@ -870,7 +870,7 @@ rw_lock_print( info = UT_LIST_GET_FIRST(lock->debug_list); while (info != NULL) { - rw_lock_debug_print(info); + rw_lock_debug_print(stderr, info); info = UT_LIST_GET_NEXT(list, info); } } @@ -882,28 +882,29 @@ Prints info of a debug struct. 
*/ void rw_lock_debug_print( /*================*/ + FILE* f, /* in: output stream */ rw_lock_debug_t* info) /* in: debug struct */ { ulint rwt; rwt = info->lock_type; - fprintf(stderr, "Locked: thread %lu file %s line %lu ", + fprintf(f, "Locked: thread %lu file %s line %lu ", (ulong) os_thread_pf(info->thread_id), info->file_name, (ulong) info->line); if (rwt == RW_LOCK_SHARED) { - fputs("S-LOCK", stderr); + fputs("S-LOCK", f); } else if (rwt == RW_LOCK_EX) { - fputs("X-LOCK", stderr); + fputs("X-LOCK", f); } else if (rwt == RW_LOCK_WAIT_EX) { - fputs("WAIT X-LOCK", stderr); + fputs("WAIT X-LOCK", f); } else { ut_error; } if (info->pass != 0) { - fprintf(stderr, " pass value %lu", (ulong) info->pass); + fprintf(f, " pass value %lu", (ulong) info->pass); } - putc('\n', stderr); + putc('\n', f); } /******************************************************************* diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index 43ffa762ddb..4d35bcff4a1 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,9 @@ +2011-01-18 The InnoDB Team + + * include/sync0rw.h, sync/sync0arr.c, sync/sync0rw.c: + Fix Bug#59579 rw_lock_debug_print outputs to stderr, not to + SHOW ENGINE INNODB STATUS + 2011-01-14 The InnoDB Team * btr/btr0cur.c, dict/dict0dict.c, handler/ha_innodb.cc, include/btr0cur.h, include/dict0mem.h, include/rem0cmp.h, diff --git a/storage/innodb_plugin/include/sync0rw.h b/storage/innodb_plugin/include/sync0rw.h index 175f3deb77c..47f7dbfe0eb 100644 --- a/storage/innodb_plugin/include/sync0rw.h +++ b/storage/innodb_plugin/include/sync0rw.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. 
Portions of this file contain modifications contributed and copyrighted by @@ -490,6 +490,7 @@ UNIV_INTERN void rw_lock_debug_print( /*================*/ + FILE* f, /*!< in: output stream */ rw_lock_debug_t* info); /*!< in: debug struct */ #endif /* UNIV_SYNC_DEBUG */ diff --git a/storage/innodb_plugin/sync/sync0arr.c b/storage/innodb_plugin/sync/sync0arr.c index 3c825e2202b..ad29b90d344 100644 --- a/storage/innodb_plugin/sync/sync0arr.c +++ b/storage/innodb_plugin/sync/sync0arr.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -715,7 +715,7 @@ print: fprintf(stderr, "rw-lock %p ", (void*) lock); sync_array_cell_print(stderr, cell); - rw_lock_debug_print(debug); + rw_lock_debug_print(stderr, debug); return(TRUE); } } diff --git a/storage/innodb_plugin/sync/sync0rw.c b/storage/innodb_plugin/sync/sync0rw.c index 572c3690a7f..00e0324becd 100644 --- a/storage/innodb_plugin/sync/sync0rw.c +++ b/storage/innodb_plugin/sync/sync0rw.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. 
Portions of this file contain modifications contributed and copyrighted by @@ -925,7 +925,7 @@ rw_lock_list_print_info( info = UT_LIST_GET_FIRST(lock->debug_list); while (info != NULL) { - rw_lock_debug_print(info); + rw_lock_debug_print(file, info); info = UT_LIST_GET_NEXT(list, info); } } @@ -973,7 +973,7 @@ rw_lock_print( info = UT_LIST_GET_FIRST(lock->debug_list); while (info != NULL) { - rw_lock_debug_print(info); + rw_lock_debug_print(stderr, info); info = UT_LIST_GET_NEXT(list, info); } } @@ -985,28 +985,29 @@ UNIV_INTERN void rw_lock_debug_print( /*================*/ + FILE* f, /*!< in: output stream */ rw_lock_debug_t* info) /*!< in: debug struct */ { ulint rwt; rwt = info->lock_type; - fprintf(stderr, "Locked: thread %lu file %s line %lu ", + fprintf(f, "Locked: thread %lu file %s line %lu ", (ulong) os_thread_pf(info->thread_id), info->file_name, (ulong) info->line); if (rwt == RW_LOCK_SHARED) { - fputs("S-LOCK", stderr); + fputs("S-LOCK", f); } else if (rwt == RW_LOCK_EX) { - fputs("X-LOCK", stderr); + fputs("X-LOCK", f); } else if (rwt == RW_LOCK_WAIT_EX) { - fputs("WAIT X-LOCK", stderr); + fputs("WAIT X-LOCK", f); } else { ut_error; } if (info->pass != 0) { - fprintf(stderr, " pass value %lu", (ulong) info->pass); + fprintf(f, " pass value %lu", (ulong) info->pass); } - putc('\n', stderr); + putc('\n', f); } /***************************************************************//** From 7f7e934f23cde651a0bc6575958e163d08806cfe Mon Sep 17 00:00:00 2001 From: Sunny Bains Date: Tue, 25 Jan 2011 18:25:36 +1100 Subject: [PATCH 06/15] Fix Bug #59683 :InnoDB latch deadlock detector/violation debug code is very slow There are two main pain points, one is lookup by thread id for sync_thread_t and the other is to do a lookup by latch or level in sync_thread_t::levels. Changed the sync_thread_t::levels lookup and reserve operation from O(N) to O(1). 
Pure lookups are still O(N); the main change for pure lookup is that we no longer need to search up to SYNC_THREAD_N_LEVELS but only up to the number of slots actually ever used, i.e. it is possible some were used in the past but are now on the free list. If the in_use count drops to 0 we reset the free list too. Overload the sync_level_t::level field to track the free list. If sync_level_t::latch == NULL then sync_level_t::level contains the ordinal value of the next free entry. rb://580 Approved by Jimmy Yang. --- storage/innobase/sync/sync0sync.c | 365 +++++++++++++++++------------- 1 file changed, 203 insertions(+), 162 deletions(-) diff --git a/storage/innobase/sync/sync0sync.c b/storage/innobase/sync/sync0sync.c index 761cc4c805e..13d8ae9ed43 100644 --- a/storage/innobase/sync/sync0sync.c +++ b/storage/innobase/sync/sync0sync.c @@ -222,21 +222,40 @@ UNIV_INTERN mysql_pfs_key_t mutex_list_mutex_key; UNIV_INTERN ibool sync_order_checks_on = FALSE; #endif /* UNIV_SYNC_DEBUG */ -/** Mutexes or rw-locks held by a thread */ -struct sync_thread_struct{ - os_thread_id_t id; /*!< OS thread id */ - sync_level_t* levels; /*!< level array for this thread; if - this is NULL this slot is unused */ +/** Number of slots reserved for each OS thread in the sync level array */ +static const ulint SYNC_THREAD_N_LEVELS = 10000; + +typedef struct sync_arr_struct sync_arr_t; + +/** Array for tracking sync levels per thread.
*/ +struct sync_arr_struct { + ulint in_use; /*!< Number of active cells */ + ulint n_elems; /*!< Number of elements in the array */ + ulint max_elems; /*!< Maximum elements */ + ulint next_free; /*!< ULINT_UNDEFINED or index of next + free slot */ + sync_level_t* elems; /*!< Array elements */ }; -/** Number of slots reserved for each OS thread in the sync level array */ -#define SYNC_THREAD_N_LEVELS 10000 +/** Mutexes or rw-locks held by a thread */ +struct sync_thread_struct{ + os_thread_id_t id; /*!< OS thread id */ + sync_arr_t* levels; /*!< level array for this thread; if + this is NULL this slot is unused */ +}; /** An acquired mutex or rw-lock and its level in the latching order */ struct sync_level_struct{ - void* latch; /*!< pointer to a mutex or an rw-lock; NULL means that - the slot is empty */ - ulint level; /*!< level of the latch in the latching order */ + void* latch; /*!< pointer to a mutex or an + rw-lock; NULL means that + the slot is empty */ + ulint level; /*!< level of the latch in the + latching order. This field is + overloaded to serve as a node in a + linked list of free nodes too. 
When + latch == NULL then this will contain + the ordinal value of the next free + element */ }; /******************************************************************//** @@ -745,27 +764,28 @@ mutex_n_reserved(void) /*==================*/ { mutex_t* mutex; - ulint count = 0; + ulint count = 0; mutex_enter(&mutex_list_mutex); - mutex = UT_LIST_GET_FIRST(mutex_list); + for (mutex = UT_LIST_GET_FIRST(mutex_list); + mutex != NULL; + mutex = UT_LIST_GET_NEXT(list, mutex)) { - while (mutex != NULL) { if (mutex_get_lock_word(mutex) != 0) { count++; } - - mutex = UT_LIST_GET_NEXT(list, mutex); } mutex_exit(&mutex_list_mutex); ut_a(count >= 1); - return(count - 1); /* Subtract one, because this function itself - was holding one mutex (mutex_list_mutex) */ + /* Subtract one, because this function itself was holding + one mutex (mutex_list_mutex) */ + + return(count - 1); } /******************************************************************//** @@ -780,20 +800,6 @@ sync_all_freed(void) return(mutex_n_reserved() + rw_lock_n_locked() == 0); } -/******************************************************************//** -Gets the value in the nth slot in the thread level arrays. -@return pointer to thread slot */ -static -sync_thread_t* -sync_thread_level_arrays_get_nth( -/*=============================*/ - ulint n) /*!< in: slot number */ -{ - ut_ad(n < OS_THREAD_MAX_N); - - return(sync_thread_level_arrays + n); -} - /******************************************************************//** Looks for the thread slot for the calling thread. 
@return pointer to thread slot, NULL if not found */ @@ -803,15 +809,15 @@ sync_thread_level_arrays_find_slot(void) /*====================================*/ { - sync_thread_t* slot; - os_thread_id_t id; ulint i; + os_thread_id_t id; id = os_thread_get_curr_id(); for (i = 0; i < OS_THREAD_MAX_N; i++) { + sync_thread_t* slot; - slot = sync_thread_level_arrays_get_nth(i); + slot = &sync_thread_level_arrays[i]; if (slot->levels && os_thread_eq(slot->id, id)) { @@ -831,12 +837,12 @@ sync_thread_level_arrays_find_free(void) /*====================================*/ { - sync_thread_t* slot; ulint i; for (i = 0; i < OS_THREAD_MAX_N; i++) { + sync_thread_t* slot; - slot = sync_thread_level_arrays_get_nth(i); + slot = &sync_thread_level_arrays[i]; if (slot->levels == NULL) { @@ -848,19 +854,44 @@ sync_thread_level_arrays_find_free(void) } /******************************************************************//** -Gets the value in the nth slot in the thread level array. -@return pointer to level slot */ +Print warning. 
*/ static -sync_level_t* -sync_thread_levels_get_nth( -/*=======================*/ - sync_level_t* arr, /*!< in: pointer to level array for an OS - thread */ - ulint n) /*!< in: slot number */ +void +sync_print_warning( +/*===============*/ + const sync_level_t* slot) /*!< in: slot for which to + print warning */ { - ut_ad(n < SYNC_THREAD_N_LEVELS); + mutex_t* mutex; - return(arr + n); + mutex = slot->latch; + + if (mutex->magic_n == MUTEX_MAGIC_N) { + fprintf(stderr, + "Mutex created at %s %lu\n", + mutex->cfile_name, (ulong) mutex->cline); + + if (mutex_get_lock_word(mutex) != 0) { + ulint line; + const char* file_name; + os_thread_id_t thread_id; + + mutex_get_debug_info( + mutex, &file_name, &line, &thread_id); + + fprintf(stderr, + "InnoDB: Locked mutex:" + " addr %p thread %ld file %s line %ld\n", + (void*) mutex, os_thread_pf(thread_id), + file_name, (ulong) line); + } else { + fputs("Not locked\n", stderr); + } + } else { + rw_lock_t* lock = slot->latch; + + rw_lock_print(lock); + } } /******************************************************************//** @@ -871,69 +902,29 @@ static ibool sync_thread_levels_g( /*=================*/ - sync_level_t* arr, /*!< in: pointer to level array for an OS + sync_arr_t* arr, /*!< in: pointer to level array for an OS thread */ ulint limit, /*!< in: level limit */ ulint warn) /*!< in: TRUE=display a diagnostic message */ { - sync_level_t* slot; - rw_lock_t* lock; - mutex_t* mutex; ulint i; - for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + for (i = 0; i < arr->n_elems; i++) { + const sync_level_t* slot; - slot = sync_thread_levels_get_nth(arr, i); - - if (slot->latch != NULL) { - if (slot->level <= limit) { - - if (!warn) { - - return(FALSE); - } - - lock = slot->latch; - mutex = slot->latch; + slot = &arr->elems[i]; + if (slot->latch != NULL && slot->level <= limit) { + if (warn) { fprintf(stderr, "InnoDB: sync levels should be" " > %lu but a level is %lu\n", (ulong) limit, (ulong) slot->level); - if (mutex->magic_n == 
MUTEX_MAGIC_N) { - fprintf(stderr, - "Mutex created at %s %lu\n", - mutex->cfile_name, - (ulong) mutex->cline); - - if (mutex_get_lock_word(mutex) != 0) { - const char* file_name; - ulint line; - os_thread_id_t thread_id; - - mutex_get_debug_info( - mutex, &file_name, - &line, &thread_id); - - fprintf(stderr, - "InnoDB: Locked mutex:" - " addr %p thread %ld" - " file %s line %ld\n", - (void*) mutex, - os_thread_pf( - thread_id), - file_name, - (ulong) line); - } else { - fputs("Not locked\n", stderr); - } - } else { - rw_lock_print(lock); - } - - return(FALSE); + sync_print_warning(slot); } + + return(FALSE); } } @@ -942,31 +933,29 @@ sync_thread_levels_g( /******************************************************************//** Checks if the level value is stored in the level array. -@return TRUE if stored */ +@return slot if found or NULL */ static -ibool +const sync_level_t* sync_thread_levels_contain( /*=======================*/ - sync_level_t* arr, /*!< in: pointer to level array for an OS + sync_arr_t* arr, /*!< in: pointer to level array for an OS thread */ ulint level) /*!< in: level */ { - sync_level_t* slot; ulint i; - for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + for (i = 0; i < arr->n_elems; i++) { + const sync_level_t* slot; - slot = sync_thread_levels_get_nth(arr, i); + slot = &arr->elems[i]; - if (slot->latch != NULL) { - if (slot->level == level) { + if (slot->latch != NULL && slot->level == level) { - return(TRUE); - } + return(slot); } } - return(FALSE); + return(NULL); } /******************************************************************//** @@ -980,10 +969,9 @@ sync_thread_levels_contains( ulint level) /*!< in: latching order level (SYNC_DICT, ...)*/ { - sync_level_t* arr; - sync_thread_t* thread_slot; - sync_level_t* slot; ulint i; + sync_arr_t* arr; + sync_thread_t* thread_slot; if (!sync_order_checks_on) { @@ -1003,9 +991,10 @@ sync_thread_levels_contains( arr = thread_slot->levels; - for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + for (i = 0; 
i < arr->n_elems; i++) { + sync_level_t* slot; - slot = sync_thread_levels_get_nth(arr, i); + slot = &arr->elems[i]; if (slot->latch != NULL && slot->level == level) { @@ -1031,10 +1020,9 @@ sync_thread_levels_nonempty_gen( also purge_is_running mutex is allowed */ { - sync_level_t* arr; - sync_thread_t* thread_slot; - sync_level_t* slot; ulint i; + sync_arr_t* arr; + sync_thread_t* thread_slot; if (!sync_order_checks_on) { @@ -1054,9 +1042,10 @@ sync_thread_levels_nonempty_gen( arr = thread_slot->levels; - for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + for (i = 0; i < arr->n_elems; ++i) { + const sync_level_t* slot; - slot = sync_thread_levels_get_nth(arr, i); + slot = &arr->elems[i]; if (slot->latch != NULL && (!dict_mutex_allowed @@ -1098,10 +1087,10 @@ sync_thread_add_level( ulint level) /*!< in: level in the latching order; if SYNC_LEVEL_VARYING, nothing is done */ { - sync_level_t* array; - sync_level_t* slot; - sync_thread_t* thread_slot; ulint i; + sync_level_t* slot; + sync_arr_t* array; + sync_thread_t* thread_slot; if (!sync_order_checks_on) { @@ -1126,20 +1115,23 @@ sync_thread_add_level( thread_slot = sync_thread_level_arrays_find_slot(); if (thread_slot == NULL) { + ulint sz; + + sz = sizeof(*array) + + (sizeof(*array->elems) * SYNC_THREAD_N_LEVELS); + /* We have to allocate the level array for a new thread */ - array = ut_malloc(sizeof(sync_level_t) * SYNC_THREAD_N_LEVELS); + array = calloc(sz, sizeof(char)); + ut_a(array != NULL); + + array->next_free = ULINT_UNDEFINED; + array->max_elems = SYNC_THREAD_N_LEVELS; + array->elems = (sync_level_t*) &array[1]; thread_slot = sync_thread_level_arrays_find_free(); - thread_slot->id = os_thread_get_curr_id(); thread_slot->levels = array; - - for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { - - slot = sync_thread_levels_get_nth(array, i); - - slot->latch = NULL; - } + thread_slot->id = os_thread_get_curr_id(); } array = thread_slot->levels; @@ -1303,19 +1295,26 @@ sync_thread_add_level( ut_error; } - for (i = 0; 
i < SYNC_THREAD_N_LEVELS; i++) { + if (array->next_free == ULINT_UNDEFINED) { + ut_a(array->n_elems < array->max_elems); - slot = sync_thread_levels_get_nth(array, i); - - if (slot->latch == NULL) { - slot->latch = latch; - slot->level = level; - - break; - } + i = array->n_elems++; + } else { + i = array->next_free; + array->next_free = array->elems[i].level; } - ut_a(i < SYNC_THREAD_N_LEVELS); + ut_a(i < array->n_elems); + ut_a(i != ULINT_UNDEFINED); + + ++array->in_use; + + slot = &array->elems[i]; + + ut_a(slot->latch == NULL); + + slot->latch = latch; + slot->level = level; mutex_exit(&sync_thread_mutex); } @@ -1331,8 +1330,7 @@ sync_thread_reset_level( /*====================*/ void* latch) /*!< in: pointer to a mutex or an rw-lock */ { - sync_level_t* array; - sync_level_t* slot; + sync_arr_t* array; sync_thread_t* thread_slot; ulint i; @@ -1363,17 +1361,37 @@ sync_thread_reset_level( array = thread_slot->levels; - for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + for (i = 0; i < array->n_elems; i++) { + sync_level_t* slot; - slot = sync_thread_levels_get_nth(array, i); + slot = &array->elems[i]; - if (slot->latch == latch) { - slot->latch = NULL; - - mutex_exit(&sync_thread_mutex); - - return(TRUE); + if (slot->latch != latch) { + continue; } + + slot->latch = NULL; + + /* Update the free slot list. See comment in sync_level_t + for the level field. */ + slot->level = array->next_free; + array->next_free = i; + + ut_a(array->in_use >= 1); + --array->in_use; + + /* If all cells are idle then reset the free + list. The assumption is that this will save + time when we need to scan up to n_elems. 
*/ + + if (array->in_use == 0) { + array->n_elems = 0; + array->next_free = ULINT_UNDEFINED; + } + + mutex_exit(&sync_thread_mutex); + + return(TRUE); } if (((mutex_t*) latch)->magic_n != MUTEX_MAGIC_N) { @@ -1403,11 +1421,6 @@ void sync_init(void) /*===========*/ { -#ifdef UNIV_SYNC_DEBUG - sync_thread_t* thread_slot; - ulint i; -#endif /* UNIV_SYNC_DEBUG */ - ut_a(sync_initialized == FALSE); sync_initialized = TRUE; @@ -1421,13 +1434,10 @@ sync_init(void) /* Create the thread latch level array where the latch levels are stored for each OS thread */ - sync_thread_level_arrays = ut_malloc(OS_THREAD_MAX_N - * sizeof(sync_thread_t)); - for (i = 0; i < OS_THREAD_MAX_N; i++) { + sync_thread_level_arrays = calloc( + sizeof(sync_thread_t), OS_THREAD_MAX_N); + ut_a(sync_thread_level_arrays != NULL); - thread_slot = sync_thread_level_arrays_get_nth(i); - thread_slot->levels = NULL; - } #endif /* UNIV_SYNC_DEBUG */ /* Init the mutex list and create the mutex to protect it. */ @@ -1454,6 +1464,34 @@ sync_init(void) #endif /* UNIV_SYNC_DEBUG */ } +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Frees all debug memory. */ +static +void +sync_thread_level_arrays_free(void) +/*===============================*/ + +{ + ulint i; + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + sync_thread_t* slot; + + slot = &sync_thread_level_arrays[i]; + + /* If this slot was allocated then free the slot memory too. */ + if (slot->levels != NULL) { + free(slot->levels); + slot->levels = NULL; + } + } + + free(sync_thread_level_arrays); + sync_thread_level_arrays = NULL; +} +#endif /* UNIV_SYNC_DEBUG */ + /******************************************************************//** Frees the resources in InnoDB's own synchronization data structures. Use os_sync_free() after calling this. 
*/ @@ -1466,17 +1504,18 @@ sync_close(void) sync_array_free(sync_primary_wait_array); - mutex = UT_LIST_GET_FIRST(mutex_list); + for (mutex = UT_LIST_GET_FIRST(mutex_list); + mutex != NULL; + mutex = UT_LIST_GET_FIRST(mutex_list)) { - while (mutex) { #ifdef UNIV_MEM_DEBUG if (mutex == &mem_hash_mutex) { mutex = UT_LIST_GET_NEXT(list, mutex); continue; } #endif /* UNIV_MEM_DEBUG */ + mutex_free(mutex); - mutex = UT_LIST_GET_FIRST(mutex_list); } mutex_free(&mutex_list_mutex); @@ -1485,6 +1524,8 @@ sync_close(void) /* Switch latching order checks on in sync0sync.c */ sync_order_checks_on = FALSE; + + sync_thread_level_arrays_free(); #endif /* UNIV_SYNC_DEBUG */ sync_initialized = FALSE; From 60a622d1c1940f80829a4df312ff49a6feae265e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 25 Jan 2011 09:56:18 +0200 Subject: [PATCH 07/15] Bug#59707 Unused compression-related parameters in buffer pool functions buf_block_alloc(): ulint zip_size is always 0. buf_LRU_get_free_block(): ulint zip_size is always 0. buf_LRU_free_block(): ibool* buf_pool_mutex_released is always NULL. Remove these parameters. buf_LRU_get_free_block(): Simplify the initialization of block->page.zip and release buf_pool_mutex() earlier. 
--- storage/innodb_plugin/ChangeLog | 9 +++++ storage/innodb_plugin/btr/btr0btr.c | 2 +- storage/innodb_plugin/btr/btr0cur.c | 5 ++- storage/innodb_plugin/btr/btr0sea.c | 2 +- storage/innodb_plugin/buf/buf0buddy.c | 2 +- storage/innodb_plugin/buf/buf0buf.c | 14 ++++---- storage/innodb_plugin/buf/buf0lru.c | 44 +++++------------------- storage/innodb_plugin/include/buf0buf.h | 6 ++-- storage/innodb_plugin/include/buf0buf.ic | 8 ++--- storage/innodb_plugin/include/buf0lru.h | 14 +++----- storage/innodb_plugin/mem/mem0mem.c | 2 +- storage/innodb_plugin/page/page0zip.c | 2 +- 12 files changed, 40 insertions(+), 70 deletions(-) diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index 4d35bcff4a1..24cac7ac2be 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,12 @@ +2011-01-25 The InnoDB Team + + * btr/btr0btr.c, btr/btr0cur.c, btr/btr0sea.c, + buf/buf0buddy.c, buf/buf0buf.c, buf/buf0lru.c, + include/buf0buf.h, include/buf0buf.ic, include/buf0lru.h, + mem/mem0mem.c, page/page0zip.c: + Fix Bug#59707 Unused compression-related parameters + in buffer pool functions + 2011-01-18 The InnoDB Team * include/sync0rw.h, sync/sync0arr.c, sync/sync0rw.c: diff --git a/storage/innodb_plugin/btr/btr0btr.c b/storage/innodb_plugin/btr/btr0btr.c index 32e2caecdb8..3d8d6048603 100644 --- a/storage/innodb_plugin/btr/btr0btr.c +++ b/storage/innodb_plugin/btr/btr0btr.c @@ -979,7 +979,7 @@ btr_page_reorganize_low( log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); #ifndef UNIV_HOTBACKUP - temp_block = buf_block_alloc(0); + temp_block = buf_block_alloc(); #else /* !UNIV_HOTBACKUP */ ut_ad(block == back_block1); temp_block = back_block2; diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c index 1fb0bc39933..f41b125b281 100644 --- a/storage/innodb_plugin/btr/btr0cur.c +++ b/storage/innodb_plugin/btr/btr0cur.c @@ -3767,13 +3767,12 @@ btr_blob_free( && buf_block_get_space(block) == space && 
buf_block_get_page_no(block) == page_no) { - if (buf_LRU_free_block(&block->page, all, NULL) - != BUF_LRU_FREED + if (buf_LRU_free_block(&block->page, all) != BUF_LRU_FREED && all && block->page.zip.data) { /* Attempt to deallocate the uncompressed page if the whole block cannot be deallocted. */ - buf_LRU_free_block(&block->page, FALSE, NULL); + buf_LRU_free_block(&block->page, FALSE); } } diff --git a/storage/innodb_plugin/btr/btr0sea.c b/storage/innodb_plugin/btr/btr0sea.c index 035fdbb61d2..9835efcf712 100644 --- a/storage/innodb_plugin/btr/btr0sea.c +++ b/storage/innodb_plugin/btr/btr0sea.c @@ -141,7 +141,7 @@ btr_search_check_free_space_in_heap(void) be enough free space in the hash table. */ if (heap->free_block == NULL) { - buf_block_t* block = buf_block_alloc(0); + buf_block_t* block = buf_block_alloc(); rw_lock_x_lock(&btr_search_latch); diff --git a/storage/innodb_plugin/buf/buf0buddy.c b/storage/innodb_plugin/buf/buf0buddy.c index ee5a569c3ff..63c99571510 100644 --- a/storage/innodb_plugin/buf/buf0buddy.c +++ b/storage/innodb_plugin/buf/buf0buddy.c @@ -327,7 +327,7 @@ buf_buddy_alloc_low( /* Try replacing an uncompressed page in the buffer pool. */ buf_pool_mutex_exit(); - block = buf_LRU_get_free_block(0); + block = buf_LRU_get_free_block(); *lru = TRUE; buf_pool_mutex_enter(); diff --git a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c index dac416f9472..6e76e4c65be 100644 --- a/storage/innodb_plugin/buf/buf0buf.c +++ b/storage/innodb_plugin/buf/buf0buf.c @@ -1283,7 +1283,7 @@ shrink_again: buf_LRU_make_block_old(&block->page); dirty++; - } else if (buf_LRU_free_block(&block->page, TRUE, NULL) + } else if (buf_LRU_free_block(&block->page, TRUE) != BUF_LRU_FREED) { nonfree++; } @@ -1729,8 +1729,7 @@ err_exit: mutex_enter(block_mutex); /* Discard the uncompressed page frame if possible. 
*/ - if (buf_LRU_free_block(bpage, FALSE, NULL) - == BUF_LRU_FREED) { + if (buf_LRU_free_block(bpage, FALSE) == BUF_LRU_FREED) { mutex_exit(block_mutex); goto lookup; @@ -2165,7 +2164,7 @@ wait_until_unfixed: buf_pool_mutex_exit(); mutex_exit(&buf_pool_zip_mutex); - block = buf_LRU_get_free_block(0); + block = buf_LRU_get_free_block(); ut_a(block); buf_pool_mutex_enter(); @@ -2291,8 +2290,7 @@ wait_until_unfixed: /* Try to evict the block from the buffer pool, to use the insert buffer as much as possible. */ - if (buf_LRU_free_block(&block->page, TRUE, NULL) - == BUF_LRU_FREED) { + if (buf_LRU_free_block(&block->page, TRUE) == BUF_LRU_FREED) { buf_pool_mutex_exit(); mutex_exit(&block->mutex); fprintf(stderr, @@ -2829,7 +2827,7 @@ buf_page_init_for_read( && UNIV_LIKELY(!recv_recovery_is_on())) { block = NULL; } else { - block = buf_LRU_get_free_block(0); + block = buf_LRU_get_free_block(); ut_ad(block); } @@ -3001,7 +2999,7 @@ buf_page_create( ut_ad(mtr->state == MTR_ACTIVE); ut_ad(space || !zip_size); - free_block = buf_LRU_get_free_block(0); + free_block = buf_LRU_get_free_block(); buf_pool_mutex_enter(); diff --git a/storage/innodb_plugin/buf/buf0lru.c b/storage/innodb_plugin/buf/buf0lru.c index e4cf218bf2e..39feb06ff23 100644 --- a/storage/innodb_plugin/buf/buf0lru.c +++ b/storage/innodb_plugin/buf/buf0lru.c @@ -575,7 +575,7 @@ buf_LRU_free_from_unzip_LRU_list( ut_ad(block->page.in_LRU_list); mutex_enter(&block->mutex); - freed = buf_LRU_free_block(&block->page, FALSE, NULL); + freed = buf_LRU_free_block(&block->page, FALSE); mutex_exit(&block->mutex); switch (freed) { @@ -636,7 +636,7 @@ buf_LRU_free_from_common_LRU_list( mutex_enter(block_mutex); accessed = buf_page_is_accessed(bpage); - freed = buf_LRU_free_block(bpage, TRUE, NULL); + freed = buf_LRU_free_block(bpage, TRUE); mutex_exit(block_mutex); switch (freed) { @@ -798,10 +798,8 @@ LRU list to the free list. 
@return the free control block, in state BUF_BLOCK_READY_FOR_USE */ UNIV_INTERN buf_block_t* -buf_LRU_get_free_block( -/*===================*/ - ulint zip_size) /*!< in: compressed page size in bytes, - or 0 if uncompressed tablespace */ +buf_LRU_get_free_block(void) +/*========================*/ { buf_block_t* block = NULL; ibool freed; @@ -877,26 +875,10 @@ loop: /* If there is a block in the free list, take it */ block = buf_LRU_get_free_only(); + buf_pool_mutex_exit(); + if (block) { - -#ifdef UNIV_DEBUG - block->page.zip.m_start = -#endif /* UNIV_DEBUG */ - block->page.zip.m_end = - block->page.zip.m_nonempty = - block->page.zip.n_blobs = 0; - - if (UNIV_UNLIKELY(zip_size)) { - ibool lru; - page_zip_set_size(&block->page.zip, zip_size); - block->page.zip.data = buf_buddy_alloc(zip_size, &lru); - UNIV_MEM_DESC(block->page.zip.data, zip_size, block); - } else { - page_zip_set_size(&block->page.zip, 0); - block->page.zip.data = NULL; - } - - buf_pool_mutex_exit(); + memset(&block->page.zip, 0, sizeof block->page.zip); if (started_monitor) { srv_print_innodb_monitor = mon_value_was; @@ -908,8 +890,6 @@ loop: /* If no block was in the free list, search from the end of the LRU list and try to free a block there */ - buf_pool_mutex_exit(); - freed = buf_LRU_search_and_free_block(n_iterations); if (freed > 0) { @@ -1378,12 +1358,8 @@ enum buf_lru_free_block_status buf_LRU_free_block( /*===============*/ buf_page_t* bpage, /*!< in: block to be freed */ - ibool zip, /*!< in: TRUE if should remove also the + ibool zip) /*!< in: TRUE if should remove also the compressed page of an uncompressed page */ - ibool* buf_pool_mutex_released) - /*!< in: pointer to a variable that will - be assigned TRUE if buf_pool_mutex - was temporarily released, or NULL */ { buf_page_t* b = NULL; mutex_t* block_mutex = buf_page_get_mutex(bpage); @@ -1554,10 +1530,6 @@ alloc: b->io_fix = BUF_IO_READ; } - if (buf_pool_mutex_released) { - *buf_pool_mutex_released = TRUE; - } - 
buf_pool_mutex_exit(); mutex_exit(block_mutex); diff --git a/storage/innodb_plugin/include/buf0buf.h b/storage/innodb_plugin/include/buf0buf.h index cd4ee5906f0..d903b443920 100644 --- a/storage/innodb_plugin/include/buf0buf.h +++ b/storage/innodb_plugin/include/buf0buf.h @@ -165,10 +165,8 @@ Allocates a buffer block. @return own: the allocated block, in state BUF_BLOCK_MEMORY */ UNIV_INLINE buf_block_t* -buf_block_alloc( -/*============*/ - ulint zip_size); /*!< in: compressed page size in bytes, - or 0 if uncompressed tablespace */ +buf_block_alloc(void); +/*=================*/ /********************************************************************//** Frees a buffer block which does not contain a file page. */ UNIV_INLINE diff --git a/storage/innodb_plugin/include/buf0buf.ic b/storage/innodb_plugin/include/buf0buf.ic index 23db684806c..0025bef5aac 100644 --- a/storage/innodb_plugin/include/buf0buf.ic +++ b/storage/innodb_plugin/include/buf0buf.ic @@ -719,14 +719,12 @@ Allocates a buffer block. 
@return own: the allocated block, in state BUF_BLOCK_MEMORY */ UNIV_INLINE buf_block_t* -buf_block_alloc( -/*============*/ - ulint zip_size) /*!< in: compressed page size in bytes, - or 0 if uncompressed tablespace */ +buf_block_alloc(void) +/*=================*/ { buf_block_t* block; - block = buf_LRU_get_free_block(zip_size); + block = buf_LRU_get_free_block(); buf_block_set_state(block, BUF_BLOCK_MEMORY); diff --git a/storage/innodb_plugin/include/buf0lru.h b/storage/innodb_plugin/include/buf0lru.h index 5a9cfd059f3..d543bce53cd 100644 --- a/storage/innodb_plugin/include/buf0lru.h +++ b/storage/innodb_plugin/include/buf0lru.h @@ -110,12 +110,9 @@ enum buf_lru_free_block_status buf_LRU_free_block( /*===============*/ buf_page_t* bpage, /*!< in: block to be freed */ - ibool zip, /*!< in: TRUE if should remove also the + ibool zip) /*!< in: TRUE if should remove also the compressed page of an uncompressed page */ - ibool* buf_pool_mutex_released); - /*!< in: pointer to a variable that will - be assigned TRUE if buf_pool_mutex - was temporarily released, or NULL */ + __attribute__((nonnull)); /******************************************************************//** Try to free a replaceable block. @return TRUE if found and freed */ @@ -146,10 +143,9 @@ LRU list to the free list. @return the free control block, in state BUF_BLOCK_READY_FOR_USE */ UNIV_INTERN buf_block_t* -buf_LRU_get_free_block( -/*===================*/ - ulint zip_size); /*!< in: compressed page size in bytes, - or 0 if uncompressed tablespace */ +buf_LRU_get_free_block(void) +/*========================*/ + __attribute__((warn_unused_result)); /******************************************************************//** Puts a block back to the free list. 
*/ diff --git a/storage/innodb_plugin/mem/mem0mem.c b/storage/innodb_plugin/mem/mem0mem.c index 1dd4db30841..86100b04fd6 100644 --- a/storage/innodb_plugin/mem/mem0mem.c +++ b/storage/innodb_plugin/mem/mem0mem.c @@ -347,7 +347,7 @@ mem_heap_create_block( return(NULL); } } else { - buf_block = buf_block_alloc(0); + buf_block = buf_block_alloc(); } block = (mem_block_t*) buf_block->frame; diff --git a/storage/innodb_plugin/page/page0zip.c b/storage/innodb_plugin/page/page0zip.c index d3b1edefc6b..bb9b0995c72 100644 --- a/storage/innodb_plugin/page/page0zip.c +++ b/storage/innodb_plugin/page/page0zip.c @@ -4439,7 +4439,7 @@ page_zip_reorganize( log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); #ifndef UNIV_HOTBACKUP - temp_block = buf_block_alloc(0); + temp_block = buf_block_alloc(); btr_search_drop_page_hash_index(block); block->check_index_page_at_flush = TRUE; #else /* !UNIV_HOTBACKUP */ From 46b7ef69916635ca0575ea4898ed8980f4bf6f5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 25 Jan 2011 11:54:50 +0200 Subject: [PATCH 08/15] Bug#59464 Race condition in row_vers_build_for_semi_consistent_read row_vers_build_for_semi_consistent_read(): Dereference version_trx before releasing kernel_mutex, but not thereafter. 
--- storage/innobase/row/row0vers.c | 10 +++++++--- storage/innodb_plugin/ChangeLog | 5 +++++ storage/innodb_plugin/row/row0vers.c | 10 +++++++--- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/storage/innobase/row/row0vers.c b/storage/innobase/row/row0vers.c index f4adfa855df..23aca8c3f2e 100644 --- a/storage/innobase/row/row0vers.c +++ b/storage/innobase/row/row0vers.c @@ -593,11 +593,15 @@ row_vers_build_for_semi_consistent_read( mutex_enter(&kernel_mutex); version_trx = trx_get_on_id(version_trx_id); + if (version_trx + && (version_trx->conc_state == TRX_COMMITTED_IN_MEMORY + || version_trx->conc_state == TRX_NOT_STARTED)) { + + version_trx = NULL; + } mutex_exit(&kernel_mutex); - if (!version_trx - || version_trx->conc_state == TRX_NOT_STARTED - || version_trx->conc_state == TRX_COMMITTED_IN_MEMORY) { + if (!version_trx) { /* We found a version that belongs to a committed transaction: return it. */ diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index 24cac7ac2be..d5e9a6bc825 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,8 @@ +2011-01-25 The InnoDB Team + + * row/row0vers.c: + Fix Bug#59464 Race condition in row_vers_build_for_semi_consistent_read + 2011-01-25 The InnoDB Team * btr/btr0btr.c, btr/btr0cur.c, btr/btr0sea.c, diff --git a/storage/innodb_plugin/row/row0vers.c b/storage/innodb_plugin/row/row0vers.c index b6d35363f08..d4fde0b939b 100644 --- a/storage/innodb_plugin/row/row0vers.c +++ b/storage/innodb_plugin/row/row0vers.c @@ -669,11 +669,15 @@ row_vers_build_for_semi_consistent_read( mutex_enter(&kernel_mutex); version_trx = trx_get_on_id(version_trx_id); + if (version_trx + && (version_trx->conc_state == TRX_COMMITTED_IN_MEMORY + || version_trx->conc_state == TRX_NOT_STARTED)) { + + version_trx = NULL; + } mutex_exit(&kernel_mutex); - if (!version_trx - || version_trx->conc_state == TRX_NOT_STARTED - || version_trx->conc_state == TRX_COMMITTED_IN_MEMORY) { 
+ if (!version_trx) { /* We found a version that belongs to a committed transaction: return it. */ From 896e0ba4e0304fbd1b056022d8e27f6ce146a83e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 25 Jan 2011 12:17:28 +0200 Subject: [PATCH 09/15] Bug#59486 Incorrect usage of UNIV_UNLIKELY() in mlog_parse_string() mlog_parse_string(): Enclose the comparison in UNIV_UNLIKELY, not the comparand. --- storage/innodb_plugin/ChangeLog | 5 +++++ storage/innodb_plugin/mtr/mtr0log.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index d5e9a6bc825..cac72fd3075 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,8 @@ +2011-01-25 The InnoDB Team + + * mtr/mtr0log.c: + Bug#59486 Incorrect usage of UNIV_UNLIKELY() in mlog_parse_string() + 2011-01-25 The InnoDB Team * row/row0vers.c: diff --git a/storage/innodb_plugin/mtr/mtr0log.c b/storage/innodb_plugin/mtr/mtr0log.c index 3f3dab36b76..3349036b5b3 100644 --- a/storage/innodb_plugin/mtr/mtr0log.c +++ b/storage/innodb_plugin/mtr/mtr0log.c @@ -408,7 +408,7 @@ mlog_parse_string( ptr += 2; if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE) - || UNIV_UNLIKELY(len + offset) > UNIV_PAGE_SIZE) { + || UNIV_UNLIKELY(len + offset > UNIV_PAGE_SIZE)) { recv_sys->found_corrupt_log = TRUE; return(NULL); From e44703db76e87fccbcc2e51606f04b18b55a0544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 25 Jan 2011 15:43:08 +0200 Subject: [PATCH 10/15] Bug#59585 Fix 58912 introduces compiler warning due to potentially uninitialized variable row_upd_changes_ord_field_binary(): Initialize dfield_len to suppress the warning. The compiler cannot know that row_ext_lookup() does initialize dfield_len for us, as it is defined in a different module. 
--- storage/innodb_plugin/ChangeLog | 6 ++++++ storage/innodb_plugin/row/row0upd.c | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index cac72fd3075..e2fdeecfcc1 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,9 @@ +2011-01-25 The InnoDB Team + + * row/row0upd.c: + Bug#59585 Fix 58912 introduces compiler warning + due to potentially uninitialized variable + 2011-01-25 The InnoDB Team * mtr/mtr0log.c: diff --git a/storage/innodb_plugin/row/row0upd.c b/storage/innodb_plugin/row/row0upd.c index 4aa1474a25b..691d263e6ed 100644 --- a/storage/innodb_plugin/row/row0upd.c +++ b/storage/innodb_plugin/row/row0upd.c @@ -1252,6 +1252,10 @@ row_upd_changes_ord_field_binary( || dfield_is_null(dfield)) { /* do nothing special */ } else if (UNIV_LIKELY_NULL(ext)) { + /* Silence a compiler warning without + silencing a Valgrind error. */ + dfield_len = 0; + UNIV_MEM_INVALID(&dfield_len, sizeof dfield_len); /* See if the column is stored externally. */ buf = row_ext_lookup(ext, col_no, &dfield_len); From 31790d7f6a1855e93d23495add7a8c873b21629c Mon Sep 17 00:00:00 2001 From: Sunny Bains Date: Wed, 26 Jan 2011 09:33:59 +1100 Subject: [PATCH 11/15] In sync_close() fix a bug introduced by the fix for Bug #59683 where we iterate over the mutex list and free each mutex. When UNIV_MEM_DEBUG is defined, we need to skip the hash mutex. It is a minor bug affecting only UNIV_SYNC_DEBUG builds, found by Michael. 
--- storage/innobase/sync/sync0sync.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/storage/innobase/sync/sync0sync.c b/storage/innobase/sync/sync0sync.c index 13d8ae9ed43..453314f465d 100644 --- a/storage/innobase/sync/sync0sync.c +++ b/storage/innobase/sync/sync0sync.c @@ -1506,7 +1506,7 @@ sync_close(void) for (mutex = UT_LIST_GET_FIRST(mutex_list); mutex != NULL; - mutex = UT_LIST_GET_FIRST(mutex_list)) { + /* No op */) { #ifdef UNIV_MEM_DEBUG if (mutex == &mem_hash_mutex) { @@ -1516,6 +1516,8 @@ sync_close(void) #endif /* UNIV_MEM_DEBUG */ mutex_free(mutex); + + mutex = UT_LIST_GET_FIRST(mutex_list); } mutex_free(&mutex_list_mutex); From 35aba604a0e3d9a33128cce49195026c4c31df87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 27 Jan 2011 10:12:00 +0200 Subject: [PATCH 12/15] ibuf_contract_ext(): Remove an #if 0 section of code that refers to trx_sys_set_ibuf_format(). Change buffer format tagging was never implemented. --- storage/innobase/ibuf/ibuf0ibuf.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/storage/innobase/ibuf/ibuf0ibuf.c b/storage/innobase/ibuf/ibuf0ibuf.c index 0cec0318bf4..e1f61d48a76 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.c +++ b/storage/innobase/ibuf/ibuf0ibuf.c @@ -2585,23 +2585,6 @@ ibuf_contract_ext( if (UNIV_UNLIKELY(ibuf->empty) && UNIV_LIKELY(!srv_shutdown_state)) { -ibuf_is_empty: - -#if 0 /* TODO */ - if (srv_shutdown_state) { - /* If the insert buffer becomes empty during - shutdown, note it in the system tablespace. */ - - trx_sys_set_ibuf_format(TRX_SYS_IBUF_EMPTY); - } - - /* TO DO: call trx_sys_set_ibuf_format() at startup - and whenever ibuf_use is changed to allow buffered - delete-marking or deleting. Never downgrade the - stamped format except when the insert buffer becomes - empty. 
*/ -#endif - return(0); } @@ -2631,7 +2614,7 @@ ibuf_is_empty: mtr_commit(&mtr); btr_pcur_close(&pcur); - goto ibuf_is_empty; + return(0); } sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur), From 786ac62c82038ed42278b3699b0661f0bb3c80ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 27 Jan 2011 13:27:29 +0200 Subject: [PATCH 13/15] Bug#59440 Race condition in XA ROLLBACK and XA COMMIT after server restart trx_get_trx_by_xid(): Invalidate trx->xid after a successful lookup, so that subsequent callers will not find the same transaction. The only callers of trx_get_trx_by_xid() will be invoking innobase_commit_low() or innobase_rollback_trx(), and those code paths should not depend on trx->xid. rb://584 approved by Jimmy Yang --- storage/innobase/include/trx0trx.h | 5 +++-- storage/innobase/trx/trx0trx.c | 26 ++++++++++++------------- storage/innodb_plugin/ChangeLog | 6 ++++++ storage/innodb_plugin/include/trx0trx.h | 4 ++-- storage/innodb_plugin/trx/trx0trx.c | 25 +++++++++++------------- 5 files changed, 34 insertions(+), 32 deletions(-) diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 97a47d9f46e..4652f45892e 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -198,8 +198,9 @@ which is in the prepared state */ trx_t * trx_get_trx_by_xid( /*===============*/ - /* out: trx or NULL */ - XID* xid); /* in: X/Open XA transaction identification */ + /* out: trx or NULL; + on match, the trx->xid will be invalidated */ + const XID* xid); /* in: X/Open XA transaction identifier */ /************************************************************************** If required, flushes the log to disk if we called trx_commit_for_mysql() with trx->flush_log_later == TRUE. 
*/ diff --git a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c index 21f75e0818f..a82d7f452fc 100644 --- a/storage/innobase/trx/trx0trx.c +++ b/storage/innobase/trx/trx0trx.c @@ -2041,14 +2041,15 @@ which is in the prepared state */ trx_t* trx_get_trx_by_xid( /*===============*/ - /* out: trx or NULL */ - XID* xid) /* in: X/Open XA transaction identification */ + /* out: trx or NULL; + on match, the trx->xid will be invalidated */ + const XID* xid) /* in: X/Open XA transaction identifier */ { trx_t* trx; if (xid == NULL) { - return (NULL); + return(NULL); } mutex_enter(&kernel_mutex); @@ -2061,10 +2062,16 @@ trx_get_trx_by_xid( of gtrid_lenght+bqual_length bytes should be the same */ - if (xid->gtrid_length == trx->xid.gtrid_length + if (trx->conc_state == TRX_PREPARED + && xid->gtrid_length == trx->xid.gtrid_length && xid->bqual_length == trx->xid.bqual_length && memcmp(xid->data, trx->xid.data, xid->gtrid_length + xid->bqual_length) == 0) { + + /* Invalidate the XID, so that subsequent calls + will not find it. 
*/ + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; break; } @@ -2073,14 +2080,5 @@ trx_get_trx_by_xid( mutex_exit(&kernel_mutex); - if (trx) { - if (trx->conc_state != TRX_PREPARED) { - - return(NULL); - } - - return(trx); - } else { - return(NULL); - } + return(trx); } diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index e2fdeecfcc1..3e14b0052e7 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,9 @@ +2011-01-27 The InnoDB Team + + * include/trx0trx.h, trx/trx0trx.c: + Bug#59440 Race condition in XA ROLLBACK and XA COMMIT + after server restart + 2011-01-25 The InnoDB Team * row/row0upd.c: diff --git a/storage/innodb_plugin/include/trx0trx.h b/storage/innodb_plugin/include/trx0trx.h index abd175d365b..833bae4a4ff 100644 --- a/storage/innodb_plugin/include/trx0trx.h +++ b/storage/innodb_plugin/include/trx0trx.h @@ -214,12 +214,12 @@ trx_recover_for_mysql( /*******************************************************************//** This function is used to find one X/Open XA distributed transaction which is in the prepared state -@return trx or NULL */ +@return trx or NULL; on match, the trx->xid will be invalidated */ UNIV_INTERN trx_t * trx_get_trx_by_xid( /*===============*/ - XID* xid); /*!< in: X/Open XA transaction identification */ + const XID* xid); /*!< in: X/Open XA transaction identifier */ /**********************************************************************//** If required, flushes the log to disk if we called trx_commit_for_mysql() with trx->flush_log_later == TRUE. 
diff --git a/storage/innodb_plugin/trx/trx0trx.c b/storage/innodb_plugin/trx/trx0trx.c index ee744fd58b1..f0bbf220815 100644 --- a/storage/innodb_plugin/trx/trx0trx.c +++ b/storage/innodb_plugin/trx/trx0trx.c @@ -2010,18 +2010,18 @@ trx_recover_for_mysql( /*******************************************************************//** This function is used to find one X/Open XA distributed transaction which is in the prepared state -@return trx or NULL */ +@return trx or NULL; on match, the trx->xid will be invalidated */ UNIV_INTERN trx_t* trx_get_trx_by_xid( /*===============*/ - XID* xid) /*!< in: X/Open XA transaction identification */ + const XID* xid) /*!< in: X/Open XA transaction identifier */ { trx_t* trx; if (xid == NULL) { - return (NULL); + return(NULL); } mutex_enter(&kernel_mutex); @@ -2034,10 +2034,16 @@ trx_get_trx_by_xid( of gtrid_lenght+bqual_length bytes should be the same */ - if (xid->gtrid_length == trx->xid.gtrid_length + if (trx->conc_state == TRX_PREPARED + && xid->gtrid_length == trx->xid.gtrid_length && xid->bqual_length == trx->xid.bqual_length && memcmp(xid->data, trx->xid.data, xid->gtrid_length + xid->bqual_length) == 0) { + + /* Invalidate the XID, so that subsequent calls + will not find it. 
*/ + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; break; } @@ -2046,14 +2052,5 @@ trx_get_trx_by_xid( mutex_exit(&kernel_mutex); - if (trx) { - if (trx->conc_state != TRX_PREPARED) { - - return(NULL); - } - - return(trx); - } else { - return(NULL); - } + return(trx); } From 71e8043bae2071ba875b18326504b1058b8deb98 Mon Sep 17 00:00:00 2001 From: Jimmy Yang Date: Fri, 28 Jan 2011 00:50:10 -0800 Subject: [PATCH 14/15] Fix Bug #59465 btr_estimate_number_of_different_key_vals use incorrect offset for external_size rb://581 approved by Marko --- storage/innobase/btr/btr0cur.c | 10 +++++----- storage/innodb_plugin/ChangeLog | 7 +++++++ storage/innodb_plugin/btr/btr0cur.c | 10 +++++----- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c index 9f4babfaae6..6c0497cbd41 100644 --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -2981,6 +2981,9 @@ btr_estimate_number_of_different_key_vals( while (!page_rec_is_supremum(rec)) { rec_t* next_rec = page_rec_get_next(rec); if (page_rec_is_supremum(next_rec)) { + total_external_size += + btr_rec_get_externally_stored_len( + rec, offsets_rec); break; } @@ -2988,7 +2991,8 @@ btr_estimate_number_of_different_key_vals( matched_bytes = 0; offsets_next_rec = rec_get_offsets(next_rec, index, offsets_next_rec, - n_cols, &heap); + ULINT_UNDEFINED, + &heap); cmp_rec_rec_with_match(rec, next_rec, offsets_rec, offsets_next_rec, @@ -3043,10 +3047,6 @@ btr_estimate_number_of_different_key_vals( } } - offsets_rec = rec_get_offsets(rec, index, offsets_rec, - ULINT_UNDEFINED, &heap); - total_external_size += btr_rec_get_externally_stored_len( - rec, offsets_rec); mtr_commit(&mtr); } diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index 3e14b0052e7..7a901fc1fa1 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,9 @@ +2011-01-27 The InnoDB Team + + * 
btr/btr0cur.c: + Bug#59465 btr_estimate_number_of_different_key_vals use + incorrect offset for external_size + 2011-01-27 The InnoDB Team * include/trx0trx.h, trx/trx0trx.c: @@ -29,6 +35,7 @@ Fix Bug#59707 Unused compression-related parameters in buffer pool functions +>>>>>>> MERGE-SOURCE 2011-01-18 The InnoDB Team * include/sync0rw.h, sync/sync0arr.c, sync/sync0rw.c: diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c index f41b125b281..874db3066b5 100644 --- a/storage/innodb_plugin/btr/btr0cur.c +++ b/storage/innodb_plugin/btr/btr0cur.c @@ -3365,6 +3365,9 @@ btr_estimate_number_of_different_key_vals( while (!page_rec_is_supremum(rec)) { rec_t* next_rec = page_rec_get_next(rec); if (page_rec_is_supremum(next_rec)) { + total_external_size += + btr_rec_get_externally_stored_len( + rec, offsets_rec); break; } @@ -3372,7 +3375,8 @@ btr_estimate_number_of_different_key_vals( matched_bytes = 0; offsets_next_rec = rec_get_offsets(next_rec, index, offsets_next_rec, - n_cols, &heap); + ULINT_UNDEFINED, + &heap); cmp_rec_rec_with_match(rec, next_rec, offsets_rec, offsets_next_rec, @@ -3427,10 +3431,6 @@ btr_estimate_number_of_different_key_vals( } } - offsets_rec = rec_get_offsets(rec, index, offsets_rec, - ULINT_UNDEFINED, &heap); - total_external_size += btr_rec_get_externally_stored_len( - rec, offsets_rec); mtr_commit(&mtr); } From b448b5d5106197cf9b13c3f10fedc4b187a83ccf Mon Sep 17 00:00:00 2001 From: Jimmy Yang Date: Fri, 28 Jan 2011 01:38:21 -0800 Subject: [PATCH 15/15] Fix Bug #59390 row_merge_build_indexes() fails to register tmpfd for PFS rb://582 approved by Marko --- storage/innobase/row/row0merge.c | 70 +++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/storage/innobase/row/row0merge.c b/storage/innobase/row/row0merge.c index 0c68d6477a4..9827141caec 100644 --- a/storage/innobase/row/row0merge.c +++ b/storage/innobase/row/row0merge.c @@ -2158,13 +2158,15 @@ 
row_merge_drop_temp_indexes(void) } /*********************************************************************//** -Create a merge file. */ -static -void -row_merge_file_create( -/*==================*/ - merge_file_t* merge_file) /*!< out: merge file structure */ +Creates a temporary merge file and, if UNIV_PFS_IO is defined, registers +the file descriptor with Performance Schema. +@return File descriptor */ +UNIV_INLINE +int +row_merge_file_create_low(void) +/*===========================*/ { + int fd; #ifdef UNIV_PFS_IO /* This temp file open does not go through normal file APIs, add instrumentation to register with @@ -2176,14 +2178,46 @@ row_merge_file_create( "Innodb Merge Temp File", __FILE__, __LINE__); #endif - merge_file->fd = innobase_mysql_tmpfile(); + fd = innobase_mysql_tmpfile(); +#ifdef UNIV_PFS_IO + register_pfs_file_open_end(locker, fd); +#endif + return(fd); +} +/*********************************************************************//** +Create a merge file. */ +static +void +row_merge_file_create( +/*==================*/ + merge_file_t* merge_file) /*!< out: merge file structure */ +{ + merge_file->fd = row_merge_file_create_low(); merge_file->offset = 0; merge_file->n_rec = 0; -#ifdef UNIV_PFS_IO - register_pfs_file_open_end(locker, merge_file->fd); -#endif } +/*********************************************************************//** +Destroy a merge file. And de-register the file from Performance Schema +if UNIV_PFS_IO is defined. */ +UNIV_INLINE +void +row_merge_file_destroy_low( +/*=======================*/ + int fd) /*!< in: merge file descriptor */ +{ +#ifdef UNIV_PFS_IO + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + register_pfs_file_io_begin(&state, locker, + fd, 0, PSI_FILE_CLOSE, + __FILE__, __LINE__); +#endif + close(fd); +#ifdef UNIV_PFS_IO + register_pfs_file_io_end(locker, 0); +#endif +} /*********************************************************************//** Destroy a merge file. 
*/ static @@ -2192,20 +2226,10 @@ row_merge_file_destroy( /*===================*/ merge_file_t* merge_file) /*!< out: merge file structure */ { -#ifdef UNIV_PFS_IO - struct PSI_file_locker* locker = NULL; - PSI_file_locker_state state; - register_pfs_file_io_begin(&state, locker, merge_file->fd, 0, PSI_FILE_CLOSE, - __FILE__, __LINE__); -#endif if (merge_file->fd != -1) { - close(merge_file->fd); + row_merge_file_destroy_low(merge_file->fd); merge_file->fd = -1; } - -#ifdef UNIV_PFS_IO - register_pfs_file_io_end(locker, 0); -#endif } /*********************************************************************//** @@ -2600,7 +2624,7 @@ row_merge_build_indexes( row_merge_file_create(&merge_files[i]); } - tmpfd = innobase_mysql_tmpfile(); + tmpfd = row_merge_file_create_low(); /* Reset the MySQL row buffer that is used when reporting duplicate keys. */ @@ -2642,7 +2666,7 @@ row_merge_build_indexes( } func_exit: - close(tmpfd); + row_merge_file_destroy_low(tmpfd); for (i = 0; i < n_indexes; i++) { row_merge_file_destroy(&merge_files[i]);